{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.20905038860974026, "eval_steps": 1000, "global_step": 14000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.4932170614981446e-05, "grad_norm": 81.5, "learning_rate": 0.0001, "loss": 5.4834, "loss/crossentropy": 3.0311607122421265, "loss/fcd": 4.953125, "loss/idx": 0.0, "loss/logits": 0.5302902162075043, "step": 1 }, { "epoch": 2.9864341229962893e-05, "grad_norm": 77.0, "learning_rate": 0.0001, "loss": 5.4138, "loss/crossentropy": 2.7625315189361572, "loss/fcd": 4.921875, "loss/idx": 0.0, "loss/logits": 0.4918830245733261, "step": 2 }, { "epoch": 4.479651184494434e-05, "grad_norm": 77.0, "learning_rate": 0.0001, "loss": 5.2211, "loss/crossentropy": 3.0718085765838623, "loss/fcd": 4.703125, "loss/idx": 0.0, "loss/logits": 0.5179921388626099, "step": 3 }, { "epoch": 5.9728682459925785e-05, "grad_norm": 66.5, "learning_rate": 0.0001, "loss": 4.9244, "loss/crossentropy": 2.7016193866729736, "loss/fcd": 4.5, "loss/idx": 0.0, "loss/logits": 0.4243808537721634, "step": 4 }, { "epoch": 7.466085307490723e-05, "grad_norm": 55.75, "learning_rate": 0.0001, "loss": 4.6617, "loss/crossentropy": 2.9749388694763184, "loss/fcd": 4.265625, "loss/idx": 0.0, "loss/logits": 0.39606572687625885, "step": 5 }, { "epoch": 8.959302368988868e-05, "grad_norm": 52.25, "learning_rate": 0.0001, "loss": 4.4842, "loss/crossentropy": 2.7478275299072266, "loss/fcd": 4.09375, "loss/idx": 0.0, "loss/logits": 0.39042919874191284, "step": 6 }, { "epoch": 0.00010452519430487013, "grad_norm": 58.25, "learning_rate": 0.0001, "loss": 4.7182, "loss/crossentropy": 3.114223837852478, "loss/fcd": 4.3125, "loss/idx": 0.0, "loss/logits": 0.4056805968284607, "step": 7 }, { "epoch": 0.00011945736491985157, "grad_norm": 47.75, "learning_rate": 0.0001, "loss": 4.4918, "loss/crossentropy": 2.852568507194519, "loss/fcd": 4.109375, "loss/idx": 0.0, "loss/logits": 0.3824669420719147, "step": 8 }, { "epoch": 0.000134389535534833, "grad_norm": 40.25, "learning_rate": 0.0001, "loss": 4.2264, "loss/crossentropy": 2.582517981529236, "loss/fcd": 3.8828125, "loss/idx": 0.0, "loss/logits": 0.3435918539762497, "step": 9 }, { "epoch": 0.00014932170614981446, "grad_norm": 32.25, "learning_rate": 0.0001, "loss": 3.967, "loss/crossentropy": 2.6108912229537964, "loss/fcd": 3.6328125, "loss/idx": 0.0, "loss/logits": 0.33417809009552, "step": 10 }, { "epoch": 0.0001642538767647959, "grad_norm": 33.75, "learning_rate": 0.0001, "loss": 4.1011, "loss/crossentropy": 2.5188854932785034, "loss/fcd": 3.75, "loss/idx": 0.0, "loss/logits": 0.3511316478252411, "step": 11 }, { "epoch": 0.00017918604737977736, "grad_norm": 32.25, "learning_rate": 0.0001, "loss": 3.9782, "loss/crossentropy": 2.6797866821289062, "loss/fcd": 3.6171875, "loss/idx": 0.0, "loss/logits": 0.36099183559417725, "step": 12 }, { "epoch": 0.00019411821799475881, "grad_norm": 30.125, "learning_rate": 0.0001, "loss": 3.7539, "loss/crossentropy": 2.842925786972046, "loss/fcd": 3.4453125, "loss/idx": 0.0, "loss/logits": 0.30860865116119385, "step": 13 }, { "epoch": 0.00020905038860974027, "grad_norm": 25.625, "learning_rate": 0.0001, "loss": 3.6666, "loss/crossentropy": 2.795591711997986, "loss/fcd": 3.375, "loss/idx": 0.0, "loss/logits": 0.29162760078907013, "step": 14 }, { "epoch": 0.0002239825592247217, "grad_norm": 24.25, "learning_rate": 0.0001, "loss": 3.5034, "loss/crossentropy": 2.6695363521575928, "loss/fcd": 3.21875, "loss/idx": 0.0, "loss/logits": 0.28466810286045074, "step": 15 }, { "epoch": 0.00023891472983970314, "grad_norm": 19.0, "grad_norm_var": 420.071875, "learning_rate": 0.0001, "loss": 3.2914, "loss/crossentropy": 2.761489510536194, "loss/fcd": 3.0546875, "loss/idx": 0.0, "loss/logits": 0.2366952747106552, "step": 16 }, { "epoch": 0.0002538469004546846, "grad_norm": 19.75, "grad_norm_var": 375.11015625, "learning_rate": 0.0001, "loss": 3.3177, "loss/crossentropy": 2.822671890258789, "loss/fcd": 3.078125, "loss/idx": 0.0, "loss/logits": 0.23954815417528152, "step": 17 }, { "epoch": 0.000268779071069666, "grad_norm": 18.25, "grad_norm_var": 326.3354166666667, "learning_rate": 0.0001, "loss": 3.3004, "loss/crossentropy": 2.580729126930237, "loss/fcd": 3.03125, "loss/idx": 0.0, "loss/logits": 0.269170880317688, "step": 18 }, { "epoch": 0.00028371124168464747, "grad_norm": 15.125, "grad_norm_var": 256.7582682291667, "learning_rate": 0.0001, "loss": 2.9912, "loss/crossentropy": 2.450445771217346, "loss/fcd": 2.78125, "loss/idx": 0.0, "loss/logits": 0.20998132228851318, "step": 19 }, { "epoch": 0.0002986434122996289, "grad_norm": 13.75, "grad_norm_var": 214.0087890625, "learning_rate": 0.0001, "loss": 3.1281, "loss/crossentropy": 2.683855891227722, "loss/fcd": 2.875, "loss/idx": 0.0, "loss/logits": 0.2530509978532791, "step": 20 }, { "epoch": 0.00031357558291461037, "grad_norm": 12.125, "grad_norm_var": 197.12682291666667, "learning_rate": 0.0001, "loss": 2.864, "loss/crossentropy": 2.523205280303955, "loss/fcd": 2.671875, "loss/idx": 0.0, "loss/logits": 0.1921716332435608, "step": 21 }, { "epoch": 0.0003285077535295918, "grad_norm": 10.5625, "grad_norm_var": 180.245556640625, "learning_rate": 0.0001, "loss": 2.8605, "loss/crossentropy": 2.6747522354125977, "loss/fcd": 2.65625, "loss/idx": 0.0, "loss/logits": 0.20420243591070175, "step": 22 }, { "epoch": 0.0003434399241445733, "grad_norm": 8.875, "grad_norm_var": 127.321728515625, "learning_rate": 0.0001, "loss": 2.7268, "loss/crossentropy": 2.48712956905365, "loss/fcd": 2.515625, "loss/idx": 0.0, "loss/logits": 0.2112211287021637, "step": 23 }, { "epoch": 0.0003583720947595547, "grad_norm": 7.5625, "grad_norm_var": 100.89635416666667, "learning_rate": 0.0001, "loss": 2.7417, "loss/crossentropy": 2.464865803718567, "loss/fcd": 2.53125, "loss/idx": 0.0, "loss/logits": 0.2104937955737114, "step": 24 }, { "epoch": 0.0003733042653745362, "grad_norm": 6.71875, "grad_norm_var": 87.20006103515625, "learning_rate": 0.0001, "loss": 2.5018, "loss/crossentropy": 2.6716034412384033, "loss/fcd": 2.3125, "loss/idx": 0.0, "loss/logits": 0.18925867974758148, "step": 25 }, { "epoch": 0.00038823643598951763, "grad_norm": 6.59375, "grad_norm_var": 84.29034830729167, "learning_rate": 0.0001, "loss": 2.6178, "loss/crossentropy": 2.6646894216537476, "loss/fcd": 2.421875, "loss/idx": 0.0, "loss/logits": 0.19596357643604279, "step": 26 }, { "epoch": 0.0004031686066044991, "grad_norm": 5.25, "grad_norm_var": 74.33019205729167, "learning_rate": 0.0001, "loss": 2.5377, "loss/crossentropy": 2.4073028564453125, "loss/fcd": 2.359375, "loss/idx": 0.0, "loss/logits": 0.17834600806236267, "step": 27 }, { "epoch": 0.00041810077721948053, "grad_norm": 4.9375, "grad_norm_var": 61.733723958333336, "learning_rate": 0.0001, "loss": 2.3672, "loss/crossentropy": 2.63876473903656, "loss/fcd": 2.1875, "loss/idx": 0.0, "loss/logits": 0.17965663224458694, "step": 28 }, { "epoch": 0.000433032947834462, "grad_norm": 4.78125, "grad_norm_var": 48.33915608723958, "learning_rate": 0.0001, "loss": 2.5811, "loss/crossentropy": 2.325456917285919, "loss/fcd": 2.375, "loss/idx": 0.0, "loss/logits": 0.20605524629354477, "step": 29 }, { "epoch": 0.0004479651184494434, "grad_norm": 5.09375, "grad_norm_var": 39.295247395833336, "learning_rate": 0.0001, "loss": 2.5423, "loss/crossentropy": 2.83246386051178, "loss/fcd": 2.359375, "loss/idx": 0.0, "loss/logits": 0.18287606537342072, "step": 30 }, { "epoch": 0.00046289728906442483, "grad_norm": 3.515625, "grad_norm_var": 30.678872680664064, "learning_rate": 0.0001, "loss": 2.3684, "loss/crossentropy": 2.755223870277405, "loss/fcd": 2.1640625, "loss/idx": 0.0, "loss/logits": 0.20429068058729172, "step": 31 }, { "epoch": 0.0004778294596794063, "grad_norm": 5.6875, "grad_norm_var": 25.990029907226564, "learning_rate": 0.0001, "loss": 3.0309, "loss/crossentropy": 2.2701921463012695, "loss/fcd": 2.7265625, "loss/idx": 0.0, "loss/logits": 0.3043238967657089, "step": 32 }, { "epoch": 0.0004927616302943877, "grad_norm": 5.0, "grad_norm_var": 19.008747355143228, "learning_rate": 0.0001, "loss": 2.7374, "loss/crossentropy": 2.8550366163253784, "loss/fcd": 2.4375, "loss/idx": 0.0, "loss/logits": 0.29989800602197647, "step": 33 }, { "epoch": 0.0005076938009093692, "grad_norm": 3.25, "grad_norm_var": 13.29976298014323, "learning_rate": 0.0001, "loss": 2.3173, "loss/crossentropy": 2.6526776552200317, "loss/fcd": 2.125, "loss/idx": 0.0, "loss/logits": 0.19229362159967422, "step": 34 }, { "epoch": 0.0005226259715243506, "grad_norm": 3.15625, "grad_norm_var": 9.967837524414062, "learning_rate": 0.0001, "loss": 2.0892, "loss/crossentropy": 2.480757474899292, "loss/fcd": 1.9375, "loss/idx": 0.0, "loss/logits": 0.15168970823287964, "step": 35 }, { "epoch": 0.000537558142139332, "grad_norm": 4.15625, "grad_norm_var": 6.6749827067057295, "learning_rate": 0.0001, "loss": 2.1965, "loss/crossentropy": 2.3921128511428833, "loss/fcd": 2.01953125, "loss/idx": 0.0, "loss/logits": 0.17700091004371643, "step": 36 }, { "epoch": 0.0005524903127543135, "grad_norm": 3.34375, "grad_norm_var": 4.415640258789063, "learning_rate": 0.0001, "loss": 2.1708, "loss/crossentropy": 2.6723674535751343, "loss/fcd": 2.01171875, "loss/idx": 0.0, "loss/logits": 0.15905070304870605, "step": 37 }, { "epoch": 0.0005674224833692949, "grad_norm": 3.09375, "grad_norm_var": 2.8907704671223957, "learning_rate": 0.0001, "loss": 2.1407, "loss/crossentropy": 2.612633228302002, "loss/fcd": 1.9921875, "loss/idx": 0.0, "loss/logits": 0.14852391928434372, "step": 38 }, { "epoch": 0.0005823546539842764, "grad_norm": 3.640625, "grad_norm_var": 1.9430623372395834, "learning_rate": 0.0001, "loss": 2.616, "loss/crossentropy": 2.366433620452881, "loss/fcd": 2.296875, "loss/idx": 0.0, "loss/logits": 0.31916864961385727, "step": 39 }, { "epoch": 0.0005972868245992578, "grad_norm": 3.171875, "grad_norm_var": 1.4934234619140625, "learning_rate": 0.0001, "loss": 2.2019, "loss/crossentropy": 2.4993181228637695, "loss/fcd": 2.0, "loss/idx": 0.0, "loss/logits": 0.20192894339561462, "step": 40 }, { "epoch": 0.0006122189952142393, "grad_norm": 2.984375, "grad_norm_var": 1.2413045247395833, "learning_rate": 0.0001, "loss": 2.1757, "loss/crossentropy": 2.4306472539901733, "loss/fcd": 1.98046875, "loss/idx": 0.0, "loss/logits": 0.19518503546714783, "step": 41 }, { "epoch": 0.0006271511658292207, "grad_norm": 3.265625, "grad_norm_var": 0.8840077718098959, "learning_rate": 0.0001, "loss": 2.1187, "loss/crossentropy": 2.8436198234558105, "loss/fcd": 1.9375, "loss/idx": 0.0, "loss/logits": 0.18118727207183838, "step": 42 }, { "epoch": 0.0006420833364442022, "grad_norm": 2.65625, "grad_norm_var": 0.8792795817057292, "learning_rate": 0.0001, "loss": 2.2649, "loss/crossentropy": 2.4525002241134644, "loss/fcd": 2.05078125, "loss/idx": 0.0, "loss/logits": 0.21407584100961685, "step": 43 }, { "epoch": 0.0006570155070591836, "grad_norm": 2.828125, "grad_norm_var": 0.8538736979166667, "learning_rate": 0.0001, "loss": 2.1198, "loss/crossentropy": 2.4759527444839478, "loss/fcd": 1.92578125, "loss/idx": 0.0, "loss/logits": 0.1939752697944641, "step": 44 }, { "epoch": 0.0006719476776741652, "grad_norm": 2.875, "grad_norm_var": 0.8129191080729167, "learning_rate": 0.0001, "loss": 2.2037, "loss/crossentropy": 2.444726586341858, "loss/fcd": 1.99609375, "loss/idx": 0.0, "loss/logits": 0.20757701992988586, "step": 45 }, { "epoch": 0.0006868798482891465, "grad_norm": 2.390625, "grad_norm_var": 0.7339019775390625, "learning_rate": 0.0001, "loss": 1.8268, "loss/crossentropy": 2.5968172550201416, "loss/fcd": 1.68359375, "loss/idx": 0.0, "loss/logits": 0.14320842921733856, "step": 46 }, { "epoch": 0.000701812018904128, "grad_norm": 2.328125, "grad_norm_var": 0.8098215738932292, "learning_rate": 0.0001, "loss": 1.93, "loss/crossentropy": 2.68948233127594, "loss/fcd": 1.765625, "loss/idx": 0.0, "loss/logits": 0.1643698811531067, "step": 47 }, { "epoch": 0.0007167441895191095, "grad_norm": 2.3125, "grad_norm_var": 0.47627665201822916, "learning_rate": 0.0001, "loss": 1.9692, "loss/crossentropy": 2.520722985267639, "loss/fcd": 1.8125, "loss/idx": 0.0, "loss/logits": 0.15668785572052002, "step": 48 }, { "epoch": 0.0007316763601340908, "grad_norm": 2.25, "grad_norm_var": 0.2718170166015625, "learning_rate": 0.0001, "loss": 1.8559, "loss/crossentropy": 2.518172264099121, "loss/fcd": 1.69921875, "loss/idx": 0.0, "loss/logits": 0.15663356333971024, "step": 49 }, { "epoch": 0.0007466085307490724, "grad_norm": 2.453125, "grad_norm_var": 0.28297119140625, "learning_rate": 0.0001, "loss": 1.9511, "loss/crossentropy": 2.5404844284057617, "loss/fcd": 1.77734375, "loss/idx": 0.0, "loss/logits": 0.1737901046872139, "step": 50 }, { "epoch": 0.0007615407013640538, "grad_norm": 2.765625, "grad_norm_var": 0.28080952962239586, "learning_rate": 0.0001, "loss": 1.9093, "loss/crossentropy": 2.7360819578170776, "loss/fcd": 1.7421875, "loss/idx": 0.0, "loss/logits": 0.1670687422156334, "step": 51 }, { "epoch": 0.0007764728719790353, "grad_norm": 2.703125, "grad_norm_var": 0.17078450520833333, "learning_rate": 0.0001, "loss": 2.1227, "loss/crossentropy": 2.581022262573242, "loss/fcd": 1.91796875, "loss/idx": 0.0, "loss/logits": 0.20477516949176788, "step": 52 }, { "epoch": 0.0007914050425940167, "grad_norm": 27.25, "grad_norm_var": 37.57099202473958, "learning_rate": 0.0001, "loss": 1.8477, "loss/crossentropy": 2.7782737016677856, "loss/fcd": 1.69921875, "loss/idx": 0.0, "loss/logits": 0.14849026501178741, "step": 53 }, { "epoch": 0.0008063372132089982, "grad_norm": 2.515625, "grad_norm_var": 37.68567606608073, "learning_rate": 0.0001, "loss": 2.2443, "loss/crossentropy": 2.6635544300079346, "loss/fcd": 1.96875, "loss/idx": 0.0, "loss/logits": 0.275559701025486, "step": 54 }, { "epoch": 0.0008212693838239796, "grad_norm": 1.9921875, "grad_norm_var": 37.9948117574056, "learning_rate": 0.0001, "loss": 1.7658, "loss/crossentropy": 2.4618231058120728, "loss/fcd": 1.62109375, "loss/idx": 0.0, "loss/logits": 0.144667848944664, "step": 55 }, { "epoch": 0.0008362015544389611, "grad_norm": 3.015625, "grad_norm_var": 38.01716079711914, "learning_rate": 0.0001, "loss": 1.8995, "loss/crossentropy": 2.638582944869995, "loss/fcd": 1.73828125, "loss/idx": 0.0, "loss/logits": 0.16121716797351837, "step": 56 }, { "epoch": 0.0008511337250539425, "grad_norm": 2.046875, "grad_norm_var": 38.21924819946289, "learning_rate": 0.0001, "loss": 1.9172, "loss/crossentropy": 2.588966488838196, "loss/fcd": 1.74609375, "loss/idx": 0.0, "loss/logits": 0.17108920216560364, "step": 57 }, { "epoch": 0.000866065895668924, "grad_norm": 1.953125, "grad_norm_var": 38.47345962524414, "learning_rate": 0.0001, "loss": 1.7661, "loss/crossentropy": 2.5097049474716187, "loss/fcd": 1.63671875, "loss/idx": 0.0, "loss/logits": 0.12938163056969643, "step": 58 }, { "epoch": 0.0008809980662839054, "grad_norm": 2.953125, "grad_norm_var": 38.424946848551436, "learning_rate": 0.0001, "loss": 1.9371, "loss/crossentropy": 2.5010175704956055, "loss/fcd": 1.765625, "loss/idx": 0.0, "loss/logits": 0.17150144279003143, "step": 59 }, { "epoch": 0.0008959302368988868, "grad_norm": 1.7578125, "grad_norm_var": 38.66942545572917, "learning_rate": 0.0001, "loss": 1.7706, "loss/crossentropy": 2.5571951866149902, "loss/fcd": 1.61328125, "loss/idx": 0.0, "loss/logits": 0.1572706550359726, "step": 60 }, { "epoch": 0.0009108624075138683, "grad_norm": 1.890625, "grad_norm_var": 38.87405497233073, "learning_rate": 0.0001, "loss": 1.7896, "loss/crossentropy": 2.4101879596710205, "loss/fcd": 1.640625, "loss/idx": 0.0, "loss/logits": 0.14897086471319199, "step": 61 }, { "epoch": 0.0009257945781288497, "grad_norm": 1.6953125, "grad_norm_var": 39.04523493448893, "learning_rate": 0.0001, "loss": 1.7102, "loss/crossentropy": 2.5094202756881714, "loss/fcd": 1.56640625, "loss/idx": 0.0, "loss/logits": 0.14381400495767593, "step": 62 }, { "epoch": 0.0009407267487438312, "grad_norm": 1.6953125, "grad_norm_var": 39.20016276041667, "learning_rate": 0.0001, "loss": 1.8086, "loss/crossentropy": 2.76530921459198, "loss/fcd": 1.640625, "loss/idx": 0.0, "loss/logits": 0.1680033802986145, "step": 63 }, { "epoch": 0.0009556589193588126, "grad_norm": 1.9296875, "grad_norm_var": 39.2866818745931, "learning_rate": 0.0001, "loss": 1.9316, "loss/crossentropy": 2.355651021003723, "loss/fcd": 1.75390625, "loss/idx": 0.0, "loss/logits": 0.17767927050590515, "step": 64 }, { "epoch": 0.0009705910899737941, "grad_norm": 1.7578125, "grad_norm_var": 39.40381673177083, "learning_rate": 0.0001, "loss": 1.7647, "loss/crossentropy": 2.4329527616500854, "loss/fcd": 1.61328125, "loss/idx": 0.0, "loss/logits": 0.1514434814453125, "step": 65 }, { "epoch": 0.0009855232605887755, "grad_norm": 2.421875, "grad_norm_var": 39.40937906901042, "learning_rate": 0.0001, "loss": 2.1449, "loss/crossentropy": 2.527339816093445, "loss/fcd": 1.8828125, "loss/idx": 0.0, "loss/logits": 0.26208290457725525, "step": 66 }, { "epoch": 0.0010004554312037569, "grad_norm": 1.7109375, "grad_norm_var": 39.62035090128581, "learning_rate": 0.0001, "loss": 1.7499, "loss/crossentropy": 2.788802742958069, "loss/fcd": 1.5859375, "loss/idx": 0.0, "loss/logits": 0.16401013731956482, "step": 67 }, { "epoch": 0.0010153876018187385, "grad_norm": 1.7578125, "grad_norm_var": 39.80255126953125, "learning_rate": 0.0001, "loss": 1.7925, "loss/crossentropy": 2.6775633096694946, "loss/fcd": 1.6171875, "loss/idx": 0.0, "loss/logits": 0.17531326413154602, "step": 68 }, { "epoch": 0.0010303197724337199, "grad_norm": 1.7578125, "grad_norm_var": 0.19091161092122397, "learning_rate": 0.0001, "loss": 1.8299, "loss/crossentropy": 2.444396138191223, "loss/fcd": 1.671875, "loss/idx": 0.0, "loss/logits": 0.15805941075086594, "step": 69 }, { "epoch": 0.0010452519430487013, "grad_norm": 1.9609375, "grad_norm_var": 0.1759429931640625, "learning_rate": 0.0001, "loss": 1.9339, "loss/crossentropy": 2.4972236156463623, "loss/fcd": 1.7109375, "loss/idx": 0.0, "loss/logits": 0.22294466942548752, "step": 70 }, { "epoch": 0.0010601841136636827, "grad_norm": 1.4296875, "grad_norm_var": 0.1976959228515625, "learning_rate": 0.0001, "loss": 1.6659, "loss/crossentropy": 2.6186927556991577, "loss/fcd": 1.515625, "loss/idx": 0.0, "loss/logits": 0.15028595924377441, "step": 71 }, { "epoch": 0.001075116284278664, "grad_norm": 4.09375, "grad_norm_var": 0.4187255859375, "learning_rate": 0.0001, "loss": 1.8539, "loss/crossentropy": 2.429172396659851, "loss/fcd": 1.6640625, "loss/idx": 0.0, "loss/logits": 0.1898861974477768, "step": 72 }, { "epoch": 0.0010900484548936457, "grad_norm": 1.9296875, "grad_norm_var": 0.4196449279785156, "learning_rate": 0.0001, "loss": 1.9193, "loss/crossentropy": 2.364608407020569, "loss/fcd": 1.7265625, "loss/idx": 0.0, "loss/logits": 0.19276107847690582, "step": 73 }, { "epoch": 0.001104980625508627, "grad_norm": 1.5234375, "grad_norm_var": 0.43635965983072916, "learning_rate": 0.0001, "loss": 1.6468, "loss/crossentropy": 2.489943504333496, "loss/fcd": 1.49609375, "loss/idx": 0.0, "loss/logits": 0.15067894011735916, "step": 74 }, { "epoch": 0.0011199127961236085, "grad_norm": 1.84375, "grad_norm_var": 0.37475179036458334, "learning_rate": 0.0001, "loss": 1.9659, "loss/crossentropy": 2.5017722845077515, "loss/fcd": 1.74609375, "loss/idx": 0.0, "loss/logits": 0.21985302865505219, "step": 75 }, { "epoch": 0.0011348449667385899, "grad_norm": 1.5859375, "grad_norm_var": 0.38093973795572916, "learning_rate": 0.0001, "loss": 1.6673, "loss/crossentropy": 2.523123264312744, "loss/fcd": 1.5234375, "loss/idx": 0.0, "loss/logits": 0.14388950169086456, "step": 76 }, { "epoch": 0.0011497771373535715, "grad_norm": 1.5703125, "grad_norm_var": 0.38931248982747396, "learning_rate": 0.0001, "loss": 1.7218, "loss/crossentropy": 2.501427173614502, "loss/fcd": 1.55859375, "loss/idx": 0.0, "loss/logits": 0.16323504596948624, "step": 77 }, { "epoch": 0.0011647093079685529, "grad_norm": 2.4375, "grad_norm_var": 0.40185139973958334, "learning_rate": 0.0001, "loss": 1.7289, "loss/crossentropy": 2.392626166343689, "loss/fcd": 1.58203125, "loss/idx": 0.0, "loss/logits": 0.1468452885746956, "step": 78 }, { "epoch": 0.0011796414785835343, "grad_norm": 1.8515625, "grad_norm_var": 0.397802734375, "learning_rate": 0.0001, "loss": 1.6939, "loss/crossentropy": 2.572135329246521, "loss/fcd": 1.54296875, "loss/idx": 0.0, "loss/logits": 0.15093941986560822, "step": 79 }, { "epoch": 0.0011945736491985157, "grad_norm": 1.703125, "grad_norm_var": 0.4023089090983073, "learning_rate": 0.0001, "loss": 1.7403, "loss/crossentropy": 2.265039086341858, "loss/fcd": 1.5625, "loss/idx": 0.0, "loss/logits": 0.17782587558031082, "step": 80 }, { "epoch": 0.0012095058198134973, "grad_norm": 1.3203125, "grad_norm_var": 0.4259783426920573, "learning_rate": 0.0001, "loss": 1.4881, "loss/crossentropy": 2.432965636253357, "loss/fcd": 1.37890625, "loss/idx": 0.0, "loss/logits": 0.1092129796743393, "step": 81 }, { "epoch": 0.0012244379904284787, "grad_norm": 1.5546875, "grad_norm_var": 0.4162394205729167, "learning_rate": 0.0001, "loss": 1.7744, "loss/crossentropy": 2.387954354286194, "loss/fcd": 1.609375, "loss/idx": 0.0, "loss/logits": 0.16505713760852814, "step": 82 }, { "epoch": 0.00123937016104346, "grad_norm": 2.203125, "grad_norm_var": 0.4204851786295573, "learning_rate": 0.0001, "loss": 1.985, "loss/crossentropy": 2.412451148033142, "loss/fcd": 1.76953125, "loss/idx": 0.0, "loss/logits": 0.21551693975925446, "step": 83 }, { "epoch": 0.0012543023316584415, "grad_norm": 1.6328125, "grad_norm_var": 0.42396011352539065, "learning_rate": 0.0001, "loss": 1.7763, "loss/crossentropy": 2.3627136945724487, "loss/fcd": 1.609375, "loss/idx": 0.0, "loss/logits": 0.1669153794646263, "step": 84 }, { "epoch": 0.0012692345022734229, "grad_norm": 2.1875, "grad_norm_var": 0.4273590087890625, "learning_rate": 0.0001, "loss": 1.8321, "loss/crossentropy": 2.3265154361724854, "loss/fcd": 1.65234375, "loss/idx": 0.0, "loss/logits": 0.17975304275751114, "step": 85 }, { "epoch": 0.0012841666728884045, "grad_norm": 1.8125, "grad_norm_var": 0.4280596415201823, "learning_rate": 0.0001, "loss": 1.8219, "loss/crossentropy": 2.644020676612854, "loss/fcd": 1.65234375, "loss/idx": 0.0, "loss/logits": 0.16957848519086838, "step": 86 }, { "epoch": 0.001299098843503386, "grad_norm": 1.4921875, "grad_norm_var": 0.4242388407389323, "learning_rate": 0.0001, "loss": 1.644, "loss/crossentropy": 2.58134126663208, "loss/fcd": 1.484375, "loss/idx": 0.0, "loss/logits": 0.15963882207870483, "step": 87 }, { "epoch": 0.0013140310141183673, "grad_norm": 1.9765625, "grad_norm_var": 0.09115397135416667, "learning_rate": 0.0001, "loss": 1.7601, "loss/crossentropy": 2.463197350502014, "loss/fcd": 1.58203125, "loss/idx": 0.0, "loss/logits": 0.17805806547403336, "step": 88 }, { "epoch": 0.0013289631847333487, "grad_norm": 1.4609375, "grad_norm_var": 0.09609781901041667, "learning_rate": 0.0001, "loss": 1.6648, "loss/crossentropy": 2.7656772136688232, "loss/fcd": 1.5078125, "loss/idx": 0.0, "loss/logits": 0.15694016218185425, "step": 89 }, { "epoch": 0.0013438953553483303, "grad_norm": 22.5, "grad_norm_var": 26.936128489176433, "learning_rate": 0.0001, "loss": 1.6137, "loss/crossentropy": 2.4002861976623535, "loss/fcd": 1.4765625, "loss/idx": 0.0, "loss/logits": 0.137087844312191, "step": 90 }, { "epoch": 0.0013588275259633117, "grad_norm": 1.8125, "grad_norm_var": 26.941302235921224, "learning_rate": 0.0001, "loss": 1.6349, "loss/crossentropy": 2.4773519039154053, "loss/fcd": 1.484375, "loss/idx": 0.0, "loss/logits": 0.15047870576381683, "step": 91 }, { "epoch": 0.001373759696578293, "grad_norm": 1.765625, "grad_norm_var": 26.907792154947916, "learning_rate": 0.0001, "loss": 1.7613, "loss/crossentropy": 2.472650408744812, "loss/fcd": 1.58203125, "loss/idx": 0.0, "loss/logits": 0.17930901050567627, "step": 92 }, { "epoch": 0.0013886918671932745, "grad_norm": 1.53125, "grad_norm_var": 26.915750885009764, "learning_rate": 0.0001, "loss": 1.6722, "loss/crossentropy": 2.6694291830062866, "loss/fcd": 1.51953125, "loss/idx": 0.0, "loss/logits": 0.15264244377613068, "step": 93 }, { "epoch": 0.001403624037808256, "grad_norm": 0.8984375, "grad_norm_var": 27.195156860351563, "learning_rate": 0.0001, "loss": 1.8717, "loss/crossentropy": 2.4455777406692505, "loss/fcd": 1.6875, "loss/idx": 0.5, "loss/logits": 0.18417862057685852, "step": 94 }, { "epoch": 0.0014185562084232375, "grad_norm": 1.0390625, "grad_norm_var": 27.35882059733073, "learning_rate": 0.0001, "loss": 1.6424, "loss/crossentropy": 2.5914158821105957, "loss/fcd": 1.49609375, "loss/idx": 0.5, "loss/logits": 0.1463368535041809, "step": 95 }, { "epoch": 0.001433488379038219, "grad_norm": 0.7890625, "grad_norm_var": 27.560646311442056, "learning_rate": 0.0001, "loss": 1.5349, "loss/crossentropy": 2.660555124282837, "loss/fcd": 1.3984375, "loss/idx": 0.5, "loss/logits": 0.1364329755306244, "step": 96 }, { "epoch": 0.0014484205496532003, "grad_norm": 0.76171875, "grad_norm_var": 27.695830726623534, "learning_rate": 0.0001, "loss": 1.6325, "loss/crossentropy": 2.461188316345215, "loss/fcd": 1.48046875, "loss/idx": 0.5, "loss/logits": 0.1520090326666832, "step": 97 }, { "epoch": 0.0014633527202681817, "grad_norm": 0.6953125, "grad_norm_var": 27.88910617828369, "learning_rate": 0.0001, "loss": 1.6549, "loss/crossentropy": 2.6517964601516724, "loss/fcd": 1.49609375, "loss/idx": 0.5, "loss/logits": 0.15884529054164886, "step": 98 }, { "epoch": 0.0014782848908831633, "grad_norm": 0.8359375, "grad_norm_var": 28.111986223856608, "learning_rate": 0.0001, "loss": 1.5727, "loss/crossentropy": 2.5368658304214478, "loss/fcd": 1.43359375, "loss/idx": 0.5, "loss/logits": 0.13915108144283295, "step": 99 }, { "epoch": 0.0014932170614981447, "grad_norm": 0.84765625, "grad_norm_var": 28.26218058268229, "learning_rate": 0.0001, "loss": 1.9082, "loss/crossentropy": 2.5145949125289917, "loss/fcd": 1.6953125, "loss/idx": 0.5, "loss/logits": 0.21293380111455917, "step": 100 }, { "epoch": 0.001508149232113126, "grad_norm": 0.7890625, "grad_norm_var": 28.47071711222331, "learning_rate": 0.0001, "loss": 1.6666, "loss/crossentropy": 2.6361730098724365, "loss/fcd": 1.49609375, "loss/idx": 0.5, "loss/logits": 0.17047739028930664, "step": 101 }, { "epoch": 0.0015230814027281075, "grad_norm": 0.90234375, "grad_norm_var": 28.61356601715088, "learning_rate": 0.0001, "loss": 1.8376, "loss/crossentropy": 2.5776859521865845, "loss/fcd": 1.640625, "loss/idx": 0.5, "loss/logits": 0.19692759215831757, "step": 102 }, { "epoch": 0.0015380135733430891, "grad_norm": 7.6875, "grad_norm_var": 30.174897702534995, "learning_rate": 0.0001, "loss": 1.6053, "loss/crossentropy": 2.398188829421997, "loss/fcd": 1.4765625, "loss/idx": 0.5, "loss/logits": 0.1287732645869255, "step": 103 }, { "epoch": 0.0015529457439580705, "grad_norm": 0.84765625, "grad_norm_var": 30.39253921508789, "learning_rate": 0.0001, "loss": 1.6112, "loss/crossentropy": 2.597610116004944, "loss/fcd": 1.46875, "loss/idx": 0.5, "loss/logits": 0.14246898889541626, "step": 104 }, { "epoch": 0.001567877914573052, "grad_norm": 0.92578125, "grad_norm_var": 30.507610003153484, "learning_rate": 0.0001, "loss": 1.9468, "loss/crossentropy": 2.6051762104034424, "loss/fcd": 1.73828125, "loss/idx": 0.5, "loss/logits": 0.20850396901369095, "step": 105 }, { "epoch": 0.0015828100851880333, "grad_norm": 0.7734375, "grad_norm_var": 2.9109150568644204, "learning_rate": 0.0001, "loss": 1.754, "loss/crossentropy": 2.248218297958374, "loss/fcd": 1.5703125, "loss/idx": 0.5, "loss/logits": 0.18368404731154442, "step": 106 }, { "epoch": 0.0015977422558030147, "grad_norm": 0.828125, "grad_norm_var": 2.921457354227702, "learning_rate": 0.0001, "loss": 1.592, "loss/crossentropy": 2.530861020088196, "loss/fcd": 1.453125, "loss/idx": 0.5, "loss/logits": 0.1389201432466507, "step": 107 }, { "epoch": 0.0016126744264179963, "grad_norm": 0.7734375, "grad_norm_var": 2.930629920959473, "learning_rate": 0.0001, "loss": 1.6142, "loss/crossentropy": 2.6938165426254272, "loss/fcd": 1.45703125, "loss/idx": 0.5, "loss/logits": 0.1571795791387558, "step": 108 }, { "epoch": 0.0016276065970329777, "grad_norm": 0.72265625, "grad_norm_var": 2.947409820556641, "learning_rate": 0.0001, "loss": 1.6045, "loss/crossentropy": 2.629867672920227, "loss/fcd": 1.4453125, "loss/idx": 0.5, "loss/logits": 0.15913766622543335, "step": 109 }, { "epoch": 0.0016425387676479591, "grad_norm": 0.63671875, "grad_norm_var": 2.9642145156860353, "learning_rate": 0.0001, "loss": 1.5314, "loss/crossentropy": 2.1994539499282837, "loss/fcd": 1.41015625, "loss/idx": 0.5, "loss/logits": 0.12125381454825401, "step": 110 }, { "epoch": 0.0016574709382629405, "grad_norm": 1.1015625, "grad_norm_var": 2.9627761205037433, "learning_rate": 0.0001, "loss": 1.7658, "loss/crossentropy": 2.5422849655151367, "loss/fcd": 1.57421875, "loss/idx": 0.5, "loss/logits": 0.19153974950313568, "step": 111 }, { "epoch": 0.0016724031088779221, "grad_norm": 0.95703125, "grad_norm_var": 2.9543312072753904, "learning_rate": 0.0001, "loss": 1.6532, "loss/crossentropy": 2.4952173233032227, "loss/fcd": 1.5078125, "loss/idx": 0.5, "loss/logits": 0.14537867531180382, "step": 112 }, { "epoch": 0.0016873352794929035, "grad_norm": 1.1640625, "grad_norm_var": 2.9379663467407227, "learning_rate": 0.0001, "loss": 1.6542, "loss/crossentropy": 2.6476502418518066, "loss/fcd": 1.49609375, "loss/idx": 0.5, "loss/logits": 0.1580853909254074, "step": 113 }, { "epoch": 0.001702267450107885, "grad_norm": 0.8359375, "grad_norm_var": 2.9282297134399413, "learning_rate": 0.0001, "loss": 1.5115, "loss/crossentropy": 2.4073606729507446, "loss/fcd": 1.37890625, "loss/idx": 0.5, "loss/logits": 0.1326112598180771, "step": 114 }, { "epoch": 0.0017171996207228663, "grad_norm": 0.71484375, "grad_norm_var": 2.9364662170410156, "learning_rate": 0.0001, "loss": 1.6048, "loss/crossentropy": 2.5658940076828003, "loss/fcd": 1.453125, "loss/idx": 0.5, "loss/logits": 0.15171286836266518, "step": 115 }, { "epoch": 0.001732131791337848, "grad_norm": 0.65625, "grad_norm_var": 2.949834124247233, "learning_rate": 0.0001, "loss": 1.5579, "loss/crossentropy": 2.807284355163574, "loss/fcd": 1.41015625, "loss/idx": 0.5, "loss/logits": 0.14778884127736092, "step": 116 }, { "epoch": 0.0017470639619528293, "grad_norm": 0.5703125, "grad_norm_var": 2.966845639546712, "learning_rate": 0.0001, "loss": 1.5297, "loss/crossentropy": 2.722702145576477, "loss/fcd": 1.3984375, "loss/idx": 0.5, "loss/logits": 0.1313047930598259, "step": 117 }, { "epoch": 0.0017619961325678107, "grad_norm": 0.609375, "grad_norm_var": 2.986028798421224, "learning_rate": 0.0001, "loss": 1.5226, "loss/crossentropy": 2.434728503227234, "loss/fcd": 1.38671875, "loss/idx": 0.5, "loss/logits": 0.13587873429059982, "step": 118 }, { "epoch": 0.0017769283031827921, "grad_norm": 0.796875, "grad_norm_var": 0.027905019124348958, "learning_rate": 0.0001, "loss": 1.5845, "loss/crossentropy": 2.431328058242798, "loss/fcd": 1.43359375, "loss/idx": 0.5, "loss/logits": 0.1509154662489891, "step": 119 }, { "epoch": 0.0017918604737977735, "grad_norm": 0.76171875, "grad_norm_var": 0.0279022216796875, "learning_rate": 0.0001, "loss": 1.6357, "loss/crossentropy": 2.545639991760254, "loss/fcd": 1.48046875, "loss/idx": 0.5, "loss/logits": 0.1552198976278305, "step": 120 }, { "epoch": 0.0018067926444127551, "grad_norm": 0.609375, "grad_norm_var": 0.028927040100097657, "learning_rate": 0.0001, "loss": 1.4819, "loss/crossentropy": 2.671551823616028, "loss/fcd": 1.35546875, "loss/idx": 0.5, "loss/logits": 0.1264294758439064, "step": 121 }, { "epoch": 0.0018217248150277365, "grad_norm": 0.7265625, "grad_norm_var": 0.029117774963378907, "learning_rate": 0.0001, "loss": 1.5531, "loss/crossentropy": 2.7223843336105347, "loss/fcd": 1.40625, "loss/idx": 0.5, "loss/logits": 0.14684423804283142, "step": 122 }, { "epoch": 0.001836656985642718, "grad_norm": 0.8125, "grad_norm_var": 0.029030799865722656, "learning_rate": 0.0001, "loss": 1.7009, "loss/crossentropy": 2.638063430786133, "loss/fcd": 1.53515625, "loss/idx": 0.5, "loss/logits": 0.16576898843050003, "step": 123 }, { "epoch": 0.0018515891562576993, "grad_norm": 0.7578125, "grad_norm_var": 0.02905572255452474, "learning_rate": 0.0001, "loss": 1.73, "loss/crossentropy": 2.351641535758972, "loss/fcd": 1.546875, "loss/idx": 0.5, "loss/logits": 0.18313253670930862, "step": 124 }, { "epoch": 0.001866521326872681, "grad_norm": 0.62890625, "grad_norm_var": 0.03028558095296224, "learning_rate": 0.0001, "loss": 1.5896, "loss/crossentropy": 2.654516100883484, "loss/fcd": 1.4375, "loss/idx": 0.5, "loss/logits": 0.15209884196519852, "step": 125 }, { "epoch": 0.0018814534974876623, "grad_norm": 0.7109375, "grad_norm_var": 0.02929865519205729, "learning_rate": 0.0001, "loss": 1.8795, "loss/crossentropy": 2.376212000846863, "loss/fcd": 1.671875, "loss/idx": 0.5, "loss/logits": 0.20766521990299225, "step": 126 }, { "epoch": 0.0018963856681026437, "grad_norm": 0.5859375, "grad_norm_var": 0.023524729410807292, "learning_rate": 0.0001, "loss": 1.5293, "loss/crossentropy": 2.6014784574508667, "loss/fcd": 1.390625, "loss/idx": 0.5, "loss/logits": 0.1386614888906479, "step": 127 }, { "epoch": 0.0019113178387176251, "grad_norm": 0.828125, "grad_norm_var": 0.02089583079020182, "learning_rate": 0.0001, "loss": 1.5522, "loss/crossentropy": 2.6718069314956665, "loss/fcd": 1.4140625, "loss/idx": 0.5, "loss/logits": 0.13817449286580086, "step": 128 }, { "epoch": 0.0019262500093326065, "grad_norm": 0.7890625, "grad_norm_var": 0.008261553446451823, "learning_rate": 0.0001, "loss": 1.5992, "loss/crossentropy": 2.6475735902786255, "loss/fcd": 1.44921875, "loss/idx": 0.5, "loss/logits": 0.14993533492088318, "step": 129 }, { "epoch": 0.0019411821799475881, "grad_norm": 0.8046875, "grad_norm_var": 0.007806841532389323, "learning_rate": 0.0001, "loss": 1.6639, "loss/crossentropy": 2.257818102836609, "loss/fcd": 1.50390625, "loss/idx": 0.5, "loss/logits": 0.15997718274593353, "step": 130 }, { "epoch": 0.0019561143505625695, "grad_norm": 0.8046875, "grad_norm_var": 0.008366902669270834, "learning_rate": 0.0001, "loss": 1.7219, "loss/crossentropy": 2.6239601373672485, "loss/fcd": 1.55078125, "loss/idx": 0.5, "loss/logits": 0.1711440533399582, "step": 131 }, { "epoch": 0.001971046521177551, "grad_norm": 0.69921875, "grad_norm_var": 0.008141009012858073, "learning_rate": 0.0001, "loss": 1.6674, "loss/crossentropy": 2.5232131481170654, "loss/fcd": 1.50390625, "loss/idx": 0.5, "loss/logits": 0.16352446377277374, "step": 132 }, { "epoch": 0.0019859786917925323, "grad_norm": 0.67578125, "grad_norm_var": 0.0067522684733072914, "learning_rate": 0.0001, "loss": 1.5317, "loss/crossentropy": 2.3178855180740356, "loss/fcd": 1.39453125, "loss/idx": 0.5, "loss/logits": 0.13720271736383438, "step": 133 }, { "epoch": 0.0020009108624075137, "grad_norm": 27.875, "grad_norm_var": 46.04944636027018, "learning_rate": 0.0001, "loss": 2.6756, "loss/crossentropy": 2.7906564474105835, "loss/fcd": 2.4296875, "loss/idx": 1.0, "loss/logits": 0.2459193617105484, "step": 134 }, { "epoch": 0.002015843033022495, "grad_norm": 44.0, "grad_norm_var": 153.3034543355306, "learning_rate": 0.0001, "loss": 3.2793, "loss/crossentropy": 2.7848106622695923, "loss/fcd": 3.0078125, "loss/idx": 1.0, "loss/logits": 0.27151423692703247, "step": 135 }, { "epoch": 0.002030775203637477, "grad_norm": 43.75, "grad_norm_var": 243.7684579849243, "learning_rate": 0.0001, "loss": 3.504, "loss/crossentropy": 2.664074659347534, "loss/fcd": 3.1875, "loss/idx": 1.0, "loss/logits": 0.3164883255958557, "step": 136 }, { "epoch": 0.0020457073742524584, "grad_norm": 40.75, "grad_norm_var": 305.90149377187095, "learning_rate": 0.0001, "loss": 3.3873, "loss/crossentropy": 2.8510701656341553, "loss/fcd": 3.09375, "loss/idx": 1.0, "loss/logits": 0.2935274988412857, "step": 137 }, { "epoch": 0.0020606395448674398, "grad_norm": 37.5, "grad_norm_var": 343.3572509129842, "learning_rate": 0.0001, "loss": 3.3702, "loss/crossentropy": 2.5609227418899536, "loss/fcd": 3.0859375, "loss/idx": 1.0, "loss/logits": 0.2842549532651901, "step": 138 }, { "epoch": 0.002075571715482421, "grad_norm": 37.25, "grad_norm_var": 368.9572041193644, "learning_rate": 0.0001, "loss": 4.486, "loss/crossentropy": 2.607009768486023, "loss/fcd": 3.8515625, "loss/idx": 1.0, "loss/logits": 0.6344500631093979, "step": 139 }, { "epoch": 0.0020905038860974025, "grad_norm": 34.25, "grad_norm_var": 375.90857741038, "learning_rate": 0.0001, "loss": 3.0759, "loss/crossentropy": 2.759778141975403, "loss/fcd": 2.828125, "loss/idx": 1.0, "loss/logits": 0.24779706448316574, "step": 140 }, { "epoch": 0.002105436056712384, "grad_norm": 34.0, "grad_norm_var": 372.6947629292806, "learning_rate": 0.0001, "loss": 3.3188, "loss/crossentropy": 2.3219879865646362, "loss/fcd": 3.0390625, "loss/idx": 1.0, "loss/logits": 0.27973373234272003, "step": 141 }, { "epoch": 0.0021203682273273653, "grad_norm": 33.25, "grad_norm_var": 359.17601114908854, "learning_rate": 0.0001, "loss": 3.1591, "loss/crossentropy": 2.8131046295166016, "loss/fcd": 2.8984375, "loss/idx": 1.0, "loss/logits": 0.2606983706355095, "step": 142 }, { "epoch": 0.0021353003979423467, "grad_norm": 33.0, "grad_norm_var": 336.12636286417643, "learning_rate": 0.0001, "loss": 3.1873, "loss/crossentropy": 2.613773822784424, "loss/fcd": 2.90625, "loss/idx": 1.0, "loss/logits": 0.2810151129961014, "step": 143 }, { "epoch": 0.002150232568557328, "grad_norm": 28.0, "grad_norm_var": 301.4397661844889, "learning_rate": 0.0001, "loss": 3.1588, "loss/crossentropy": 2.1197726130485535, "loss/fcd": 2.90625, "loss/idx": 1.0, "loss/logits": 0.25257067382335663, "step": 144 }, { "epoch": 0.00216516473917231, "grad_norm": 28.5, "grad_norm_var": 260.5796641031901, "learning_rate": 0.0001, "loss": 2.9352, "loss/crossentropy": 2.5937873125076294, "loss/fcd": 2.703125, "loss/idx": 1.0, "loss/logits": 0.23209355771541595, "step": 145 }, { "epoch": 0.0021800969097872914, "grad_norm": 27.25, "grad_norm_var": 213.44209976196288, "learning_rate": 0.0001, "loss": 3.0205, "loss/crossentropy": 2.4897440671920776, "loss/fcd": 2.765625, "loss/idx": 1.0, "loss/logits": 0.254846952855587, "step": 146 }, { "epoch": 0.0021950290804022728, "grad_norm": 31.625, "grad_norm_var": 160.14161987304686, "learning_rate": 0.0001, "loss": 3.1151, "loss/crossentropy": 2.679656982421875, "loss/fcd": 2.84375, "loss/idx": 1.0, "loss/logits": 0.27131521701812744, "step": 147 }, { "epoch": 0.002209961251017254, "grad_norm": 25.75, "grad_norm_var": 100.99951419830322, "learning_rate": 0.0001, "loss": 2.9074, "loss/crossentropy": 2.7504318952560425, "loss/fcd": 2.671875, "loss/idx": 1.0, "loss/logits": 0.23552851378917694, "step": 148 }, { "epoch": 0.0022248934216322356, "grad_norm": 23.875, "grad_norm_var": 38.628580729166664, "learning_rate": 0.0001, "loss": 2.9145, "loss/crossentropy": 2.801925301551819, "loss/fcd": 2.6640625, "loss/idx": 1.0, "loss/logits": 0.2504773437976837, "step": 149 }, { "epoch": 0.002239825592247217, "grad_norm": 22.625, "grad_norm_var": 44.05358072916667, "learning_rate": 0.0001, "loss": 2.8546, "loss/crossentropy": 2.5748791694641113, "loss/fcd": 2.609375, "loss/idx": 1.0, "loss/logits": 0.24526876956224442, "step": 150 }, { "epoch": 0.0022547577628621984, "grad_norm": 22.25, "grad_norm_var": 41.244205729166666, "learning_rate": 0.0001, "loss": 2.6247, "loss/crossentropy": 2.8942376375198364, "loss/fcd": 2.4296875, "loss/idx": 1.0, "loss/logits": 0.1949758157134056, "step": 151 }, { "epoch": 0.0022696899334771798, "grad_norm": 23.25, "grad_norm_var": 33.96243489583333, "learning_rate": 0.0001, "loss": 2.8511, "loss/crossentropy": 2.8165948390960693, "loss/fcd": 2.625, "loss/idx": 1.0, "loss/logits": 0.22610026597976685, "step": 152 }, { "epoch": 0.002284622104092161, "grad_norm": 21.5, "grad_norm_var": 30.0322265625, "learning_rate": 0.0001, "loss": 2.8665, "loss/crossentropy": 2.9259716272354126, "loss/fcd": 2.625, "loss/idx": 1.0, "loss/logits": 0.24152649194002151, "step": 153 }, { "epoch": 0.002299554274707143, "grad_norm": 21.5, "grad_norm_var": 27.8822265625, "learning_rate": 0.0001, "loss": 3.0903, "loss/crossentropy": 2.579954981803894, "loss/fcd": 2.78125, "loss/idx": 1.0, "loss/logits": 0.30902038514614105, "step": 154 }, { "epoch": 0.0023144864453221244, "grad_norm": 17.375, "grad_norm_var": 28.0375, "learning_rate": 0.0001, "loss": 2.7236, "loss/crossentropy": 2.7883050441741943, "loss/fcd": 2.5078125, "loss/idx": 1.0, "loss/logits": 0.21582189947366714, "step": 155 }, { "epoch": 0.0023294186159371058, "grad_norm": 16.25, "grad_norm_var": 30.2875, "learning_rate": 0.0001, "loss": 2.7658, "loss/crossentropy": 2.316504955291748, "loss/fcd": 2.5234375, "loss/idx": 1.0, "loss/logits": 0.24237968027591705, "step": 156 }, { "epoch": 0.002344350786552087, "grad_norm": 15.1875, "grad_norm_var": 31.399593098958334, "learning_rate": 0.0001, "loss": 2.7595, "loss/crossentropy": 2.7337818145751953, "loss/fcd": 2.515625, "loss/idx": 1.0, "loss/logits": 0.24386876821517944, "step": 157 }, { "epoch": 0.0023592829571670686, "grad_norm": 14.3125, "grad_norm_var": 31.591927083333335, "learning_rate": 0.0001, "loss": 2.6083, "loss/crossentropy": 2.8262592554092407, "loss/fcd": 2.40625, "loss/idx": 1.0, "loss/logits": 0.20209631323814392, "step": 158 }, { "epoch": 0.00237421512778205, "grad_norm": 13.5, "grad_norm_var": 30.048177083333332, "learning_rate": 0.0001, "loss": 2.7447, "loss/crossentropy": 2.6527985334396362, "loss/fcd": 2.5, "loss/idx": 1.0, "loss/logits": 0.24472637474536896, "step": 159 }, { "epoch": 0.0023891472983970314, "grad_norm": 13.125, "grad_norm_var": 32.070247395833334, "learning_rate": 0.0001, "loss": 2.453, "loss/crossentropy": 2.714062452316284, "loss/fcd": 2.2578125, "loss/idx": 1.0, "loss/logits": 0.19521619379520416, "step": 160 }, { "epoch": 0.0024040794690120128, "grad_norm": 14.0625, "grad_norm_var": 30.885921223958334, "learning_rate": 0.0001, "loss": 2.7171, "loss/crossentropy": 2.446950674057007, "loss/fcd": 2.5078125, "loss/idx": 1.0, "loss/logits": 0.20926345884799957, "step": 161 }, { "epoch": 0.0024190116396269946, "grad_norm": 13.5, "grad_norm_var": 29.804541015625, "learning_rate": 0.0001, "loss": 2.9389, "loss/crossentropy": 2.5623066425323486, "loss/fcd": 2.6796875, "loss/idx": 1.0, "loss/logits": 0.25920529663562775, "step": 162 }, { "epoch": 0.002433943810241976, "grad_norm": 13.4375, "grad_norm_var": 20.725, "learning_rate": 0.0001, "loss": 2.5012, "loss/crossentropy": 2.540536642074585, "loss/fcd": 2.2890625, "loss/idx": 1.0, "loss/logits": 0.2121095061302185, "step": 163 }, { "epoch": 0.0024488759808569574, "grad_norm": 14.125, "grad_norm_var": 17.4978515625, "learning_rate": 0.0001, "loss": 2.6137, "loss/crossentropy": 2.8521658182144165, "loss/fcd": 2.375, "loss/idx": 1.0, "loss/logits": 0.2387429103255272, "step": 164 }, { "epoch": 0.0024638081514719388, "grad_norm": 13.625, "grad_norm_var": 15.341080729166666, "learning_rate": 0.0001, "loss": 2.9265, "loss/crossentropy": 2.394273519515991, "loss/fcd": 2.640625, "loss/idx": 1.0, "loss/logits": 0.28584469854831696, "step": 165 }, { "epoch": 0.00247874032208692, "grad_norm": 14.0, "grad_norm_var": 13.351041666666667, "learning_rate": 0.0001, "loss": 2.4902, "loss/crossentropy": 2.939634919166565, "loss/fcd": 2.2734375, "loss/idx": 1.0, "loss/logits": 0.21672210842370987, "step": 166 }, { "epoch": 0.0024936724927019016, "grad_norm": 13.3125, "grad_norm_var": 11.267952473958333, "learning_rate": 0.0001, "loss": 2.5294, "loss/crossentropy": 2.645583152770996, "loss/fcd": 2.296875, "loss/idx": 1.0, "loss/logits": 0.23250159621238708, "step": 167 }, { "epoch": 0.002508604663316883, "grad_norm": 11.9375, "grad_norm_var": 7.959635416666667, "learning_rate": 0.0001, "loss": 2.3236, "loss/crossentropy": 2.652674078941345, "loss/fcd": 2.1328125, "loss/idx": 1.0, "loss/logits": 0.1908191666007042, "step": 168 }, { "epoch": 0.0025235368339318644, "grad_norm": 10.875, "grad_norm_var": 5.873372395833333, "learning_rate": 0.0001, "loss": 2.4839, "loss/crossentropy": 2.424571990966797, "loss/fcd": 2.234375, "loss/idx": 1.0, "loss/logits": 0.2495182603597641, "step": 169 }, { "epoch": 0.0025384690045468458, "grad_norm": 12.375, "grad_norm_var": 2.418229166666667, "learning_rate": 0.0001, "loss": 2.5325, "loss/crossentropy": 2.4959967136383057, "loss/fcd": 2.3046875, "loss/idx": 1.0, "loss/logits": 0.22784889489412308, "step": 170 }, { "epoch": 0.0025534011751618276, "grad_norm": 12.0625, "grad_norm_var": 1.6587076822916667, "learning_rate": 0.0001, "loss": 2.4315, "loss/crossentropy": 2.6240097284317017, "loss/fcd": 2.21875, "loss/idx": 1.0, "loss/logits": 0.21272272616624832, "step": 171 }, { "epoch": 0.002568333345776809, "grad_norm": 12.5, "grad_norm_var": 1.1528483072916667, "learning_rate": 0.0001, "loss": 2.6707, "loss/crossentropy": 2.7601370811462402, "loss/fcd": 2.3984375, "loss/idx": 1.0, "loss/logits": 0.27226200699806213, "step": 172 }, { "epoch": 0.0025832655163917904, "grad_norm": 12.5625, "grad_norm_var": 0.9040201822916667, "learning_rate": 0.0001, "loss": 2.596, "loss/crossentropy": 2.7772231101989746, "loss/fcd": 2.3671875, "loss/idx": 1.0, "loss/logits": 0.22885487973690033, "step": 173 }, { "epoch": 0.002598197687006772, "grad_norm": 13.5625, "grad_norm_var": 0.8161295572916667, "learning_rate": 0.0001, "loss": 2.905, "loss/crossentropy": 2.641968846321106, "loss/fcd": 2.625, "loss/idx": 1.0, "loss/logits": 0.2800135463476181, "step": 174 }, { "epoch": 0.002613129857621753, "grad_norm": 12.6875, "grad_norm_var": 0.80703125, "learning_rate": 0.0001, "loss": 2.9335, "loss/crossentropy": 2.4210065603256226, "loss/fcd": 2.6640625, "loss/idx": 1.0, "loss/logits": 0.2694525122642517, "step": 175 }, { "epoch": 0.0026280620282367346, "grad_norm": 12.5, "grad_norm_var": 0.8197265625, "learning_rate": 0.0001, "loss": 2.8887, "loss/crossentropy": 2.781251311302185, "loss/fcd": 2.6015625, "loss/idx": 1.0, "loss/logits": 0.28714829683303833, "step": 176 }, { "epoch": 0.002642994198851716, "grad_norm": 11.0, "grad_norm_var": 0.9497233072916667, "learning_rate": 0.0001, "loss": 2.9995, "loss/crossentropy": 2.67316734790802, "loss/fcd": 2.703125, "loss/idx": 1.0, "loss/logits": 0.29639333486557007, "step": 177 }, { "epoch": 0.0026579263694666974, "grad_norm": 10.375, "grad_norm_var": 1.2492024739583334, "learning_rate": 0.0001, "loss": 2.7215, "loss/crossentropy": 2.549267888069153, "loss/fcd": 2.4765625, "loss/idx": 1.0, "loss/logits": 0.2449359893798828, "step": 178 }, { "epoch": 0.0026728585400816788, "grad_norm": 10.5625, "grad_norm_var": 1.4288899739583334, "learning_rate": 0.0001, "loss": 2.7281, "loss/crossentropy": 2.6656607389450073, "loss/fcd": 2.4921875, "loss/idx": 1.0, "loss/logits": 0.23593086749315262, "step": 179 }, { "epoch": 0.0026877907106966606, "grad_norm": 9.0625, "grad_norm_var": 1.8520833333333333, "learning_rate": 0.0001, "loss": 2.8713, "loss/crossentropy": 2.4903770685195923, "loss/fcd": 2.59375, "loss/idx": 1.0, "loss/logits": 0.2775098979473114, "step": 180 }, { "epoch": 0.002702722881311642, "grad_norm": 8.3125, "grad_norm_var": 2.509228515625, "learning_rate": 0.0001, "loss": 2.7875, "loss/crossentropy": 2.9355252981185913, "loss/fcd": 2.5078125, "loss/idx": 1.0, "loss/logits": 0.2797327786684036, "step": 181 }, { "epoch": 0.0027176550519266234, "grad_norm": 8.5625, "grad_norm_var": 2.71171875, "learning_rate": 0.0001, "loss": 2.6506, "loss/crossentropy": 2.7791460752487183, "loss/fcd": 2.4296875, "loss/idx": 1.0, "loss/logits": 0.22087856382131577, "step": 182 }, { "epoch": 0.002732587222541605, "grad_norm": 7.5625, "grad_norm_var": 3.3046875, "learning_rate": 0.0001, "loss": 2.7138, "loss/crossentropy": 2.7682063579559326, "loss/fcd": 2.46875, "loss/idx": 1.0, "loss/logits": 0.24506456404924393, "step": 183 }, { "epoch": 0.002747519393156586, "grad_norm": 7.15625, "grad_norm_var": 4.15572509765625, "learning_rate": 0.0001, "loss": 2.5292, "loss/crossentropy": 2.508063316345215, "loss/fcd": 2.3046875, "loss/idx": 1.0, "loss/logits": 0.2244826927781105, "step": 184 }, { "epoch": 0.0027624515637715676, "grad_norm": 7.53125, "grad_norm_var": 4.790950520833333, "learning_rate": 0.0001, "loss": 2.6223, "loss/crossentropy": 2.9126468896865845, "loss/fcd": 2.3828125, "loss/idx": 1.0, "loss/logits": 0.23952852189540863, "step": 185 }, { "epoch": 0.002777383734386549, "grad_norm": 7.0, "grad_norm_var": 5.269661458333333, "learning_rate": 0.0001, "loss": 2.615, "loss/crossentropy": 2.417146325111389, "loss/fcd": 2.390625, "loss/idx": 1.0, "loss/logits": 0.22438553720712662, "step": 186 }, { "epoch": 0.0027923159050015304, "grad_norm": 6.5625, "grad_norm_var": 5.785286458333333, "learning_rate": 0.0001, "loss": 2.5657, "loss/crossentropy": 2.8265384435653687, "loss/fcd": 2.3359375, "loss/idx": 1.0, "loss/logits": 0.2297244518995285, "step": 187 }, { "epoch": 0.002807248075616512, "grad_norm": 6.71875, "grad_norm_var": 5.826688639322916, "learning_rate": 0.0001, "loss": 2.4346, "loss/crossentropy": 2.577815532684326, "loss/fcd": 2.2265625, "loss/idx": 1.0, "loss/logits": 0.20803897082805634, "step": 188 }, { "epoch": 0.0028221802462314936, "grad_norm": 6.25, "grad_norm_var": 5.72476806640625, "learning_rate": 0.0001, "loss": 2.5743, "loss/crossentropy": 2.7740859985351562, "loss/fcd": 2.328125, "loss/idx": 1.0, "loss/logits": 0.24621784687042236, "step": 189 }, { "epoch": 0.002837112416846475, "grad_norm": 7.78125, "grad_norm_var": 4.364518229166666, "learning_rate": 0.0001, "loss": 3.1307, "loss/crossentropy": 2.6582623720169067, "loss/fcd": 2.8203125, "loss/idx": 1.0, "loss/logits": 0.3103819936513901, "step": 190 }, { "epoch": 0.0028520445874614564, "grad_norm": 5.375, "grad_norm_var": 3.8446451822916665, "learning_rate": 0.0001, "loss": 2.3101, "loss/crossentropy": 2.8031907081604004, "loss/fcd": 2.109375, "loss/idx": 1.0, "loss/logits": 0.20073574036359787, "step": 191 }, { "epoch": 0.002866976758076438, "grad_norm": 5.53125, "grad_norm_var": 2.949051920572917, "learning_rate": 0.0001, "loss": 2.4215, "loss/crossentropy": 2.6993457078933716, "loss/fcd": 2.21875, "loss/idx": 1.0, "loss/logits": 0.20273278653621674, "step": 192 }, { "epoch": 0.002881908928691419, "grad_norm": 5.21875, "grad_norm_var": 2.597509765625, "learning_rate": 0.0001, "loss": 2.357, "loss/crossentropy": 2.544357180595398, "loss/fcd": 2.1484375, "loss/idx": 1.0, "loss/logits": 0.20859003067016602, "step": 193 }, { "epoch": 0.0028968410993064006, "grad_norm": 6.125, "grad_norm_var": 2.0817545572916667, "learning_rate": 0.0001, "loss": 2.4965, "loss/crossentropy": 2.489700198173523, "loss/fcd": 2.265625, "loss/idx": 1.0, "loss/logits": 0.23092350363731384, "step": 194 }, { "epoch": 0.002911773269921382, "grad_norm": 4.8125, "grad_norm_var": 1.575634765625, "learning_rate": 0.0001, "loss": 2.3381, "loss/crossentropy": 2.7617045640945435, "loss/fcd": 2.1328125, "loss/idx": 1.0, "loss/logits": 0.20526950061321259, "step": 195 }, { "epoch": 0.0029267054405363634, "grad_norm": 4.71875, "grad_norm_var": 1.47213134765625, "learning_rate": 0.0001, "loss": 2.2927, "loss/crossentropy": 2.5566166639328003, "loss/fcd": 2.09375, "loss/idx": 1.0, "loss/logits": 0.1989428475499153, "step": 196 }, { "epoch": 0.0029416376111513452, "grad_norm": 4.40625, "grad_norm_var": 1.5214680989583333, "learning_rate": 0.0001, "loss": 2.3069, "loss/crossentropy": 2.708446979522705, "loss/fcd": 2.09375, "loss/idx": 1.0, "loss/logits": 0.21319883316755295, "step": 197 }, { "epoch": 0.0029565697817663266, "grad_norm": 4.59375, "grad_norm_var": 1.3256144205729166, "learning_rate": 0.0001, "loss": 2.2289, "loss/crossentropy": 2.3388549089431763, "loss/fcd": 2.0390625, "loss/idx": 1.0, "loss/logits": 0.18980170786380768, "step": 198 }, { "epoch": 0.002971501952381308, "grad_norm": 4.15625, "grad_norm_var": 1.3792805989583334, "learning_rate": 0.0001, "loss": 2.2524, "loss/crossentropy": 2.5469167232513428, "loss/fcd": 2.0546875, "loss/idx": 1.0, "loss/logits": 0.19768796861171722, "step": 199 }, { "epoch": 0.0029864341229962894, "grad_norm": 3.921875, "grad_norm_var": 1.4788808186848958, "learning_rate": 0.0001, "loss": 2.1652, "loss/crossentropy": 2.4613125324249268, "loss/fcd": 1.984375, "loss/idx": 1.0, "loss/logits": 0.18085117638111115, "step": 200 }, { "epoch": 0.003001366293611271, "grad_norm": 4.15625, "grad_norm_var": 1.3527577718098958, "learning_rate": 0.0001, "loss": 2.2654, "loss/crossentropy": 2.6382863521575928, "loss/fcd": 2.0546875, "loss/idx": 1.0, "loss/logits": 0.21074112504720688, "step": 201 }, { "epoch": 0.003016298464226252, "grad_norm": 3.78125, "grad_norm_var": 1.338508097330729, "learning_rate": 0.0001, "loss": 2.3743, "loss/crossentropy": 2.652292013168335, "loss/fcd": 2.14453125, "loss/idx": 1.0, "loss/logits": 0.229776993393898, "step": 202 }, { "epoch": 0.0030312306348412336, "grad_norm": 3.40625, "grad_norm_var": 1.4116607666015626, "learning_rate": 0.0001, "loss": 2.236, "loss/crossentropy": 2.683787226676941, "loss/fcd": 2.03125, "loss/idx": 1.0, "loss/logits": 0.20470323413610458, "step": 203 }, { "epoch": 0.003046162805456215, "grad_norm": 3.6875, "grad_norm_var": 1.3153554280598958, "learning_rate": 0.0001, "loss": 2.0852, "loss/crossentropy": 2.662025213241577, "loss/fcd": 1.921875, "loss/idx": 1.0, "loss/logits": 0.16330592334270477, "step": 204 }, { "epoch": 0.0030610949760711964, "grad_norm": 4.0625, "grad_norm_var": 1.2119618733723958, "learning_rate": 0.0001, "loss": 2.6026, "loss/crossentropy": 2.5425872802734375, "loss/fcd": 2.3359375, "loss/idx": 1.0, "loss/logits": 0.26663239300251007, "step": 205 }, { "epoch": 0.0030760271466861782, "grad_norm": 4.21875, "grad_norm_var": 0.5574452718098958, "learning_rate": 0.0001, "loss": 2.2165, "loss/crossentropy": 2.7666549682617188, "loss/fcd": 2.03125, "loss/idx": 1.0, "loss/logits": 0.1852124035358429, "step": 206 }, { "epoch": 0.0030909593173011596, "grad_norm": 3.515625, "grad_norm_var": 0.5592610677083333, "learning_rate": 0.0001, "loss": 2.1929, "loss/crossentropy": 2.853086471557617, "loss/fcd": 2.0078125, "loss/idx": 1.0, "loss/logits": 0.18510984629392624, "step": 207 }, { "epoch": 0.003105891487916141, "grad_norm": 3.03125, "grad_norm_var": 0.5709798177083333, "learning_rate": 0.0001, "loss": 2.3248, "loss/crossentropy": 2.38408362865448, "loss/fcd": 2.125, "loss/idx": 1.0, "loss/logits": 0.1997532695531845, "step": 208 }, { "epoch": 0.0031208236585311224, "grad_norm": 2.59375, "grad_norm_var": 0.6584798177083333, "learning_rate": 0.0001, "loss": 2.2643, "loss/crossentropy": 2.7543792724609375, "loss/fcd": 2.04296875, "loss/idx": 1.0, "loss/logits": 0.22130031883716583, "step": 209 }, { "epoch": 0.003135755829146104, "grad_norm": 5.4375, "grad_norm_var": 0.5000325520833333, "learning_rate": 0.0001, "loss": 2.1592, "loss/crossentropy": 2.7001391649246216, "loss/fcd": 1.9765625, "loss/idx": 1.0, "loss/logits": 0.1826585754752159, "step": 210 }, { "epoch": 0.0031506879997610852, "grad_norm": 4.5, "grad_norm_var": 0.473583984375, "learning_rate": 0.0001, "loss": 2.1441, "loss/crossentropy": 2.70257830619812, "loss/fcd": 1.96875, "loss/idx": 1.0, "loss/logits": 0.17531797289848328, "step": 211 }, { "epoch": 0.0031656201703760666, "grad_norm": 2.609375, "grad_norm_var": 0.5528228759765625, "learning_rate": 0.0001, "loss": 2.0612, "loss/crossentropy": 2.393709421157837, "loss/fcd": 1.890625, "loss/idx": 1.0, "loss/logits": 0.170525424182415, "step": 212 }, { "epoch": 0.003180552340991048, "grad_norm": 4.28125, "grad_norm_var": 0.5450266520182292, "learning_rate": 0.0001, "loss": 2.6544, "loss/crossentropy": 2.5453147888183594, "loss/fcd": 2.375, "loss/idx": 1.0, "loss/logits": 0.27943994104862213, "step": 213 }, { "epoch": 0.0031954845116060294, "grad_norm": 2.125, "grad_norm_var": 0.6883941650390625, "learning_rate": 0.0001, "loss": 2.067, "loss/crossentropy": 2.6982057094573975, "loss/fcd": 1.88671875, "loss/idx": 1.0, "loss/logits": 0.1802719309926033, "step": 214 }, { "epoch": 0.0032104166822210112, "grad_norm": 2.78125, "grad_norm_var": 0.7261708577473959, "learning_rate": 0.0001, "loss": 2.0135, "loss/crossentropy": 2.5155314207077026, "loss/fcd": 1.86328125, "loss/idx": 1.0, "loss/logits": 0.15022382885217667, "step": 215 }, { "epoch": 0.0032253488528359926, "grad_norm": 2.65625, "grad_norm_var": 0.7773396809895833, "learning_rate": 0.0001, "loss": 2.2039, "loss/crossentropy": 2.611793875694275, "loss/fcd": 1.99609375, "loss/idx": 1.0, "loss/logits": 0.20781183242797852, "step": 216 }, { "epoch": 0.003240281023450974, "grad_norm": 2.15625, "grad_norm_var": 0.8664021809895833, "learning_rate": 0.0001, "loss": 1.9168, "loss/crossentropy": 2.9469950199127197, "loss/fcd": 1.765625, "loss/idx": 1.0, "loss/logits": 0.15117117017507553, "step": 217 }, { "epoch": 0.0032552131940659554, "grad_norm": 1.96875, "grad_norm_var": 0.98629150390625, "learning_rate": 0.0001, "loss": 2.033, "loss/crossentropy": 2.321299910545349, "loss/fcd": 1.859375, "loss/idx": 1.0, "loss/logits": 0.1736186519265175, "step": 218 }, { "epoch": 0.003270145364680937, "grad_norm": 2.9375, "grad_norm_var": 0.994287109375, "learning_rate": 0.0001, "loss": 2.3963, "loss/crossentropy": 2.4344794750213623, "loss/fcd": 2.1328125, "loss/idx": 1.0, "loss/logits": 0.26349252462387085, "step": 219 }, { "epoch": 0.0032850775352959182, "grad_norm": 1.9296875, "grad_norm_var": 1.0931068420410157, "learning_rate": 0.0001, "loss": 2.1104, "loss/crossentropy": 2.6011242866516113, "loss/fcd": 1.91796875, "loss/idx": 1.0, "loss/logits": 0.1924804523587227, "step": 220 }, { "epoch": 0.0033000097059108996, "grad_norm": 1.859375, "grad_norm_var": 1.1358497619628907, "learning_rate": 0.0001, "loss": 2.0349, "loss/crossentropy": 2.6590973138809204, "loss/fcd": 1.86328125, "loss/idx": 1.0, "loss/logits": 0.17158202826976776, "step": 221 }, { "epoch": 0.003314941876525881, "grad_norm": 1.828125, "grad_norm_var": 1.1165504455566406, "learning_rate": 0.0001, "loss": 1.9973, "loss/crossentropy": 2.5507930517196655, "loss/fcd": 1.828125, "loss/idx": 1.0, "loss/logits": 0.16922374814748764, "step": 222 }, { "epoch": 0.0033298740471408624, "grad_norm": 4.21875, "grad_norm_var": 1.2062721252441406, "learning_rate": 0.0001, "loss": 2.1822, "loss/crossentropy": 2.44870126247406, "loss/fcd": 2.0078125, "loss/idx": 1.0, "loss/logits": 0.17442134022712708, "step": 223 }, { "epoch": 0.0033448062177558443, "grad_norm": 2.328125, "grad_norm_var": 1.2278785705566406, "learning_rate": 0.0001, "loss": 2.0348, "loss/crossentropy": 2.673762083053589, "loss/fcd": 1.8515625, "loss/idx": 1.0, "loss/logits": 0.18328066915273666, "step": 224 }, { "epoch": 0.0033597383883708257, "grad_norm": 1.6875, "grad_norm_var": 1.3147865295410157, "learning_rate": 0.0001, "loss": 1.8397, "loss/crossentropy": 2.640251398086548, "loss/fcd": 1.70703125, "loss/idx": 1.0, "loss/logits": 0.13263830170035362, "step": 225 }, { "epoch": 0.003374670558985807, "grad_norm": 2.953125, "grad_norm_var": 0.8373207092285156, "learning_rate": 0.0001, "loss": 2.2305, "loss/crossentropy": 2.267286777496338, "loss/fcd": 2.03515625, "loss/idx": 1.0, "loss/logits": 0.19532842934131622, "step": 226 }, { "epoch": 0.0033896027296007884, "grad_norm": 1.625, "grad_norm_var": 0.6548255920410156, "learning_rate": 0.0001, "loss": 1.9428, "loss/crossentropy": 2.665201783180237, "loss/fcd": 1.77734375, "loss/idx": 1.0, "loss/logits": 0.1654854491353035, "step": 227 }, { "epoch": 0.00340453490021577, "grad_norm": 2.125, "grad_norm_var": 0.6622047424316406, "learning_rate": 0.0001, "loss": 2.2125, "loss/crossentropy": 2.5612581968307495, "loss/fcd": 2.0234375, "loss/idx": 1.0, "loss/logits": 0.18906734883785248, "step": 228 }, { "epoch": 0.0034194670708307512, "grad_norm": 1.9765625, "grad_norm_var": 0.43646240234375, "learning_rate": 0.0001, "loss": 2.3429, "loss/crossentropy": 2.2903116941452026, "loss/fcd": 2.1015625, "loss/idx": 1.0, "loss/logits": 0.24134419858455658, "step": 229 }, { "epoch": 0.0034343992414457326, "grad_norm": 1.7109375, "grad_norm_var": 0.45806859334309896, "learning_rate": 0.0001, "loss": 2.0235, "loss/crossentropy": 2.728991985321045, "loss/fcd": 1.84375, "loss/idx": 1.0, "loss/logits": 0.1797611489892006, "step": 230 }, { "epoch": 0.003449331412060714, "grad_norm": 2.140625, "grad_norm_var": 0.44230321248372395, "learning_rate": 0.0001, "loss": 2.0665, "loss/crossentropy": 2.8149020671844482, "loss/fcd": 1.87890625, "loss/idx": 1.0, "loss/logits": 0.1875438541173935, "step": 231 }, { "epoch": 0.003464263582675696, "grad_norm": 2.84375, "grad_norm_var": 0.45449803670247396, "learning_rate": 0.0001, "loss": 2.253, "loss/crossentropy": 2.5558494329452515, "loss/fcd": 2.0390625, "loss/idx": 1.0, "loss/logits": 0.2139800414443016, "step": 232 }, { "epoch": 0.0034791957532906773, "grad_norm": 1.7421875, "grad_norm_var": 0.47138671875, "learning_rate": 0.0001, "loss": 1.9924, "loss/crossentropy": 2.5154411792755127, "loss/fcd": 1.8203125, "loss/idx": 1.0, "loss/logits": 0.17209748923778534, "step": 233 }, { "epoch": 0.0034941279239056587, "grad_norm": 1.6171875, "grad_norm_var": 0.4919288635253906, "learning_rate": 0.0001, "loss": 1.9748, "loss/crossentropy": 2.405826687812805, "loss/fcd": 1.796875, "loss/idx": 1.0, "loss/logits": 0.17791152000427246, "step": 234 }, { "epoch": 0.00350906009452064, "grad_norm": 1.8359375, "grad_norm_var": 0.4624176025390625, "learning_rate": 0.0001, "loss": 2.0062, "loss/crossentropy": 2.6848820447921753, "loss/fcd": 1.82421875, "loss/idx": 1.0, "loss/logits": 0.18202318251132965, "step": 235 }, { "epoch": 0.0035239922651356215, "grad_norm": 1.6640625, "grad_norm_var": 0.4746785481770833, "learning_rate": 0.0001, "loss": 2.138, "loss/crossentropy": 2.6112011671066284, "loss/fcd": 1.93359375, "loss/idx": 1.0, "loss/logits": 0.20442968606948853, "step": 236 }, { "epoch": 0.003538924435750603, "grad_norm": 1.5, "grad_norm_var": 0.4959462483723958, "learning_rate": 0.0001, "loss": 2.1163, "loss/crossentropy": 2.3305634260177612, "loss/fcd": 1.93359375, "loss/idx": 1.0, "loss/logits": 0.1827048435807228, "step": 237 }, { "epoch": 0.0035538566063655842, "grad_norm": 1.40625, "grad_norm_var": 0.5230550130208333, "learning_rate": 0.0001, "loss": 1.9209, "loss/crossentropy": 2.7014966011047363, "loss/fcd": 1.76171875, "loss/idx": 1.0, "loss/logits": 0.15916066616773605, "step": 238 }, { "epoch": 0.0035687887769805656, "grad_norm": 2.671875, "grad_norm_var": 0.23271382649739583, "learning_rate": 0.0001, "loss": 2.1395, "loss/crossentropy": 2.541683793067932, "loss/fcd": 1.91796875, "loss/idx": 1.0, "loss/logits": 0.22150883078575134, "step": 239 }, { "epoch": 0.003583720947595547, "grad_norm": 1.375, "grad_norm_var": 0.24642740885416667, "learning_rate": 0.0001, "loss": 1.9673, "loss/crossentropy": 2.613878011703491, "loss/fcd": 1.796875, "loss/idx": 1.0, "loss/logits": 0.17040642350912094, "step": 240 }, { "epoch": 0.003598653118210529, "grad_norm": 1.6875, "grad_norm_var": 0.24642740885416667, "learning_rate": 0.0001, "loss": 1.8668, "loss/crossentropy": 2.526310443878174, "loss/fcd": 1.71484375, "loss/idx": 1.0, "loss/logits": 0.15190982818603516, "step": 241 }, { "epoch": 0.0036135852888255103, "grad_norm": 1.4453125, "grad_norm_var": 0.18276748657226563, "learning_rate": 0.0001, "loss": 1.8594, "loss/crossentropy": 2.6557728052139282, "loss/fcd": 1.703125, "loss/idx": 1.0, "loss/logits": 0.1562422439455986, "step": 242 }, { "epoch": 0.0036285174594404917, "grad_norm": 1.6796875, "grad_norm_var": 0.18141988118489583, "learning_rate": 0.0001, "loss": 2.1091, "loss/crossentropy": 2.42824125289917, "loss/fcd": 1.91015625, "loss/idx": 1.0, "loss/logits": 0.19891205430030823, "step": 243 }, { "epoch": 0.003643449630055473, "grad_norm": 1.453125, "grad_norm_var": 0.18400065104166666, "learning_rate": 0.0001, "loss": 2.0081, "loss/crossentropy": 2.31497859954834, "loss/fcd": 1.8359375, "loss/idx": 1.0, "loss/logits": 0.17215853184461594, "step": 244 }, { "epoch": 0.0036583818006704545, "grad_norm": 1.4375, "grad_norm_var": 0.1892473856608073, "learning_rate": 0.0001, "loss": 1.8746, "loss/crossentropy": 2.573368787765503, "loss/fcd": 1.71484375, "loss/idx": 1.0, "loss/logits": 0.15973138809204102, "step": 245 }, { "epoch": 0.003673313971285436, "grad_norm": 1.3515625, "grad_norm_var": 0.19982274373372397, "learning_rate": 0.0001, "loss": 1.8213, "loss/crossentropy": 2.602314829826355, "loss/fcd": 1.67578125, "loss/idx": 1.0, "loss/logits": 0.14549748599529266, "step": 246 }, { "epoch": 0.0036882461419004173, "grad_norm": 1.3515625, "grad_norm_var": 0.19666341145833333, "learning_rate": 0.0001, "loss": 1.8882, "loss/crossentropy": 2.596967101097107, "loss/fcd": 1.73046875, "loss/idx": 1.0, "loss/logits": 0.1577165126800537, "step": 247 }, { "epoch": 0.0037031783125153987, "grad_norm": 1.5, "grad_norm_var": 0.10305582682291667, "learning_rate": 0.0001, "loss": 1.9983, "loss/crossentropy": 2.6739336252212524, "loss/fcd": 1.8203125, "loss/idx": 1.0, "loss/logits": 0.17800860106945038, "step": 248 }, { "epoch": 0.00371811048313038, "grad_norm": 1.1796875, "grad_norm_var": 0.11272379557291666, "learning_rate": 0.0001, "loss": 1.7476, "loss/crossentropy": 2.737492322921753, "loss/fcd": 1.609375, "loss/idx": 1.0, "loss/logits": 0.13822893053293228, "step": 249 }, { "epoch": 0.003733042653745362, "grad_norm": 1.8671875, "grad_norm_var": 0.11812744140625, "learning_rate": 0.0001, "loss": 1.7779, "loss/crossentropy": 2.7095056772232056, "loss/fcd": 1.63671875, "loss/idx": 1.0, "loss/logits": 0.1412084773182869, "step": 250 }, { "epoch": 0.0037479748243603433, "grad_norm": 1.109375, "grad_norm_var": 0.12709121704101561, "learning_rate": 0.0001, "loss": 1.7714, "loss/crossentropy": 2.606199264526367, "loss/fcd": 1.640625, "loss/idx": 1.0, "loss/logits": 0.1308179721236229, "step": 251 }, { "epoch": 0.0037629069949753247, "grad_norm": 1.671875, "grad_norm_var": 0.1272216796875, "learning_rate": 0.0001, "loss": 1.8573, "loss/crossentropy": 2.528549313545227, "loss/fcd": 1.69140625, "loss/idx": 1.0, "loss/logits": 0.1659131497144699, "step": 252 }, { "epoch": 0.003777839165590306, "grad_norm": 1.40625, "grad_norm_var": 0.12830810546875, "learning_rate": 0.0001, "loss": 1.8859, "loss/crossentropy": 2.6335842609405518, "loss/fcd": 1.71484375, "loss/idx": 1.0, "loss/logits": 0.1710173338651657, "step": 253 }, { "epoch": 0.0037927713362052875, "grad_norm": 1.1171875, "grad_norm_var": 0.13857396443684897, "learning_rate": 0.0001, "loss": 1.858, "loss/crossentropy": 2.474130392074585, "loss/fcd": 1.69921875, "loss/idx": 1.0, "loss/logits": 0.15880661457777023, "step": 254 }, { "epoch": 0.003807703506820269, "grad_norm": 1.2109375, "grad_norm_var": 0.047408040364583334, "learning_rate": 0.0001, "loss": 1.8397, "loss/crossentropy": 2.475555896759033, "loss/fcd": 1.6875, "loss/idx": 1.0, "loss/logits": 0.15221639722585678, "step": 255 }, { "epoch": 0.0038226356774352503, "grad_norm": 1.3984375, "grad_norm_var": 0.047277577718098956, "learning_rate": 0.0001, "loss": 2.0511, "loss/crossentropy": 2.3949726819992065, "loss/fcd": 1.87890625, "loss/idx": 1.0, "loss/logits": 0.17222578823566437, "step": 256 }, { "epoch": 0.0038375678480502317, "grad_norm": 1.1015625, "grad_norm_var": 0.04855550130208333, "learning_rate": 0.0001, "loss": 1.7403, "loss/crossentropy": 2.4693208932876587, "loss/fcd": 1.6015625, "loss/idx": 1.0, "loss/logits": 0.1387447491288185, "step": 257 }, { "epoch": 0.003852500018665213, "grad_norm": 1.1875, "grad_norm_var": 0.05089696248372396, "learning_rate": 0.0001, "loss": 1.8965, "loss/crossentropy": 2.524168014526367, "loss/fcd": 1.734375, "loss/idx": 1.0, "loss/logits": 0.1620769277215004, "step": 258 }, { "epoch": 0.003867432189280195, "grad_norm": 1.5078125, "grad_norm_var": 0.04579442342122396, "learning_rate": 0.0001, "loss": 1.7984, "loss/crossentropy": 2.66902232170105, "loss/fcd": 1.65625, "loss/idx": 1.0, "loss/logits": 0.1421596258878708, "step": 259 }, { "epoch": 0.0038823643598951763, "grad_norm": 1.4609375, "grad_norm_var": 0.0458892822265625, "learning_rate": 0.0001, "loss": 1.887, "loss/crossentropy": 2.543527603149414, "loss/fcd": 1.71484375, "loss/idx": 1.0, "loss/logits": 0.17219997197389603, "step": 260 }, { "epoch": 0.0038972965305101577, "grad_norm": 1.3046875, "grad_norm_var": 0.04572931925455729, "learning_rate": 0.0001, "loss": 2.0272, "loss/crossentropy": 2.4247206449508667, "loss/fcd": 1.83203125, "loss/idx": 1.0, "loss/logits": 0.195211723446846, "step": 261 }, { "epoch": 0.003912228701125139, "grad_norm": 1.296875, "grad_norm_var": 0.0459625244140625, "learning_rate": 0.0001, "loss": 1.7184, "loss/crossentropy": 2.844157338142395, "loss/fcd": 1.578125, "loss/idx": 1.0, "loss/logits": 0.14025697112083435, "step": 262 }, { "epoch": 0.003927160871740121, "grad_norm": 1.4765625, "grad_norm_var": 0.0468902587890625, "learning_rate": 0.0001, "loss": 1.8431, "loss/crossentropy": 2.630376935005188, "loss/fcd": 1.68359375, "loss/idx": 1.0, "loss/logits": 0.15945688635110855, "step": 263 }, { "epoch": 0.003942093042355102, "grad_norm": 1.2109375, "grad_norm_var": 0.04680557250976562, "learning_rate": 0.0001, "loss": 1.6698, "loss/crossentropy": 2.665648579597473, "loss/fcd": 1.52734375, "loss/idx": 1.5, "loss/logits": 0.14248844236135483, "step": 264 }, { "epoch": 0.003957025212970084, "grad_norm": 0.87890625, "grad_norm_var": 0.05905907948811849, "learning_rate": 0.0001, "loss": 1.66, "loss/crossentropy": 2.4616761207580566, "loss/fcd": 1.51953125, "loss/idx": 1.5, "loss/logits": 0.1404593661427498, "step": 265 }, { "epoch": 0.003971957383585065, "grad_norm": 1.0546875, "grad_norm_var": 0.04162947336832682, "learning_rate": 0.0001, "loss": 1.7816, "loss/crossentropy": 2.717252254486084, "loss/fcd": 1.58984375, "loss/idx": 1.5, "loss/logits": 0.19176837801933289, "step": 266 }, { "epoch": 0.0039868895542000465, "grad_norm": 0.8515625, "grad_norm_var": 0.051465288798014326, "learning_rate": 0.0001, "loss": 1.6546, "loss/crossentropy": 2.7635334730148315, "loss/fcd": 1.5078125, "loss/idx": 1.5, "loss/logits": 0.14682532101869583, "step": 267 }, { "epoch": 0.0040018217248150275, "grad_norm": 1.0234375, "grad_norm_var": 0.04200890858968099, "learning_rate": 0.0001, "loss": 1.8043, "loss/crossentropy": 2.5384750366210938, "loss/fcd": 1.640625, "loss/idx": 1.5, "loss/logits": 0.1637115702033043, "step": 268 }, { "epoch": 0.004016753895430009, "grad_norm": 0.87109375, "grad_norm_var": 0.046477254231770834, "learning_rate": 0.0001, "loss": 1.7611, "loss/crossentropy": 2.486850619316101, "loss/fcd": 1.60546875, "loss/idx": 1.5, "loss/logits": 0.15567568689584732, "step": 269 }, { "epoch": 0.00403168606604499, "grad_norm": 0.9140625, "grad_norm_var": 0.05088094075520833, "learning_rate": 0.0001, "loss": 1.7084, "loss/crossentropy": 2.5260289907455444, "loss/fcd": 1.546875, "loss/idx": 1.5, "loss/logits": 0.16151602566242218, "step": 270 }, { "epoch": 0.004046618236659972, "grad_norm": 1.125, "grad_norm_var": 0.05089492797851562, "learning_rate": 0.0001, "loss": 1.7167, "loss/crossentropy": 2.548484683036804, "loss/fcd": 1.5546875, "loss/idx": 1.5, "loss/logits": 0.16197797656059265, "step": 271 }, { "epoch": 0.004061550407274954, "grad_norm": 0.98046875, "grad_norm_var": 0.048888079325358075, "learning_rate": 0.0001, "loss": 1.6865, "loss/crossentropy": 2.499767541885376, "loss/fcd": 1.53515625, "loss/idx": 1.5, "loss/logits": 0.1513931304216385, "step": 272 }, { "epoch": 0.004076482577889935, "grad_norm": 0.9609375, "grad_norm_var": 0.05085188547770182, "learning_rate": 0.0001, "loss": 1.7289, "loss/crossentropy": 2.458220362663269, "loss/fcd": 1.5625, "loss/idx": 1.5, "loss/logits": 0.16639846563339233, "step": 273 }, { "epoch": 0.004091414748504917, "grad_norm": 0.88671875, "grad_norm_var": 0.054264068603515625, "learning_rate": 0.0001, "loss": 1.7724, "loss/crossentropy": 2.7474247217178345, "loss/fcd": 1.60546875, "loss/idx": 1.5, "loss/logits": 0.16692470759153366, "step": 274 }, { "epoch": 0.004106346919119898, "grad_norm": 1.640625, "grad_norm_var": 0.062361653645833334, "learning_rate": 0.0001, "loss": 1.8246, "loss/crossentropy": 2.3119794130325317, "loss/fcd": 1.6484375, "loss/idx": 1.5, "loss/logits": 0.17614249885082245, "step": 275 }, { "epoch": 0.0041212790897348795, "grad_norm": 0.94921875, "grad_norm_var": 0.05554040273030599, "learning_rate": 0.0001, "loss": 1.767, "loss/crossentropy": 2.567260980606079, "loss/fcd": 1.59375, "loss/idx": 1.5, "loss/logits": 0.17328765988349915, "step": 276 }, { "epoch": 0.0041362112603498605, "grad_norm": 0.9453125, "grad_norm_var": 0.053282610575358075, "learning_rate": 0.0001, "loss": 1.6422, "loss/crossentropy": 2.7794957160949707, "loss/fcd": 1.4921875, "loss/idx": 1.5, "loss/logits": 0.14996777474880219, "step": 277 }, { "epoch": 0.004151143430964842, "grad_norm": 0.96484375, "grad_norm_var": 0.04998067220052083, "learning_rate": 0.0001, "loss": 1.645, "loss/crossentropy": 2.632931113243103, "loss/fcd": 1.5, "loss/idx": 1.5, "loss/logits": 0.14498791843652725, "step": 278 }, { "epoch": 0.004166075601579823, "grad_norm": 0.91796875, "grad_norm_var": 0.03740685780843099, "learning_rate": 0.0001, "loss": 1.6941, "loss/crossentropy": 2.5799747705459595, "loss/fcd": 1.53125, "loss/idx": 1.5, "loss/logits": 0.1628100574016571, "step": 279 }, { "epoch": 0.004181007772194805, "grad_norm": 0.8671875, "grad_norm_var": 0.03562768300374349, "learning_rate": 0.0001, "loss": 1.6888, "loss/crossentropy": 2.750143885612488, "loss/fcd": 1.53125, "loss/idx": 1.5, "loss/logits": 0.15750249475240707, "step": 280 }, { "epoch": 0.004195939942809787, "grad_norm": 0.8359375, "grad_norm_var": 0.036376698811848955, "learning_rate": 0.0001, "loss": 1.5538, "loss/crossentropy": 2.39098060131073, "loss/fcd": 1.421875, "loss/idx": 1.5, "loss/logits": 0.13189761340618134, "step": 281 }, { "epoch": 0.004210872113424768, "grad_norm": 0.87109375, "grad_norm_var": 0.036821937561035155, "learning_rate": 0.0001, "loss": 1.6044, "loss/crossentropy": 2.5139598846435547, "loss/fcd": 1.46875, "loss/idx": 1.5, "loss/logits": 0.13561376184225082, "step": 282 }, { "epoch": 0.00422580428403975, "grad_norm": 0.84375, "grad_norm_var": 0.03695468902587891, "learning_rate": 0.0001, "loss": 1.8361, "loss/crossentropy": 2.2974244356155396, "loss/fcd": 1.66015625, "loss/idx": 1.5, "loss/logits": 0.17598910629749298, "step": 283 }, { "epoch": 0.004240736454654731, "grad_norm": 1.34375, "grad_norm_var": 0.04544213612874349, "learning_rate": 0.0001, "loss": 1.9345, "loss/crossentropy": 2.6373846530914307, "loss/fcd": 1.72265625, "loss/idx": 1.5, "loss/logits": 0.21188458800315857, "step": 284 }, { "epoch": 0.0042556686252697125, "grad_norm": 0.91796875, "grad_norm_var": 0.04480584462483724, "learning_rate": 0.0001, "loss": 1.6757, "loss/crossentropy": 2.4831149578094482, "loss/fcd": 1.515625, "loss/idx": 1.5, "loss/logits": 0.16006582230329514, "step": 285 }, { "epoch": 0.0042706007958846935, "grad_norm": 0.97265625, "grad_norm_var": 0.04436620076497396, "learning_rate": 0.0001, "loss": 1.7398, "loss/crossentropy": 2.4333916902542114, "loss/fcd": 1.578125, "loss/idx": 1.5, "loss/logits": 0.16168130189180374, "step": 286 }, { "epoch": 0.004285532966499675, "grad_norm": 0.82421875, "grad_norm_var": 0.04506626129150391, "learning_rate": 0.0001, "loss": 1.643, "loss/crossentropy": 2.5061216354370117, "loss/fcd": 1.48828125, "loss/idx": 1.5, "loss/logits": 0.1546824723482132, "step": 287 }, { "epoch": 0.004300465137114656, "grad_norm": 0.8671875, "grad_norm_var": 0.0459014892578125, "learning_rate": 0.0001, "loss": 1.6412, "loss/crossentropy": 2.4621150493621826, "loss/fcd": 1.484375, "loss/idx": 1.5, "loss/logits": 0.1568301096558571, "step": 288 }, { "epoch": 0.004315397307729638, "grad_norm": 0.80078125, "grad_norm_var": 0.047817420959472653, "learning_rate": 0.0001, "loss": 1.5355, "loss/crossentropy": 2.621973991394043, "loss/fcd": 1.40625, "loss/idx": 1.5, "loss/logits": 0.12922291457653046, "step": 289 }, { "epoch": 0.00433032947834462, "grad_norm": 0.8203125, "grad_norm_var": 0.04879124959309896, "learning_rate": 0.0001, "loss": 1.6732, "loss/crossentropy": 2.392053723335266, "loss/fcd": 1.51953125, "loss/idx": 1.5, "loss/logits": 0.15370090305805206, "step": 290 }, { "epoch": 0.004345261648959601, "grad_norm": 0.92578125, "grad_norm_var": 0.01599267323811849, "learning_rate": 0.0001, "loss": 1.7259, "loss/crossentropy": 2.4002796411514282, "loss/fcd": 1.546875, "loss/idx": 1.5, "loss/logits": 0.17903384566307068, "step": 291 }, { "epoch": 0.004360193819574583, "grad_norm": 0.87890625, "grad_norm_var": 0.01599725087483724, "learning_rate": 0.0001, "loss": 1.6282, "loss/crossentropy": 2.4668020009994507, "loss/fcd": 1.48046875, "loss/idx": 1.5, "loss/logits": 0.1476999893784523, "step": 292 }, { "epoch": 0.004375125990189564, "grad_norm": 0.9609375, "grad_norm_var": 0.01608117421468099, "learning_rate": 0.0001, "loss": 1.9094, "loss/crossentropy": 2.46618115901947, "loss/fcd": 1.70703125, "loss/idx": 1.5, "loss/logits": 0.2024083137512207, "step": 293 }, { "epoch": 0.0043900581608045455, "grad_norm": 0.84375, "grad_norm_var": 0.016165924072265626, "learning_rate": 0.0001, "loss": 1.7257, "loss/crossentropy": 2.5607261657714844, "loss/fcd": 1.55859375, "loss/idx": 1.5, "loss/logits": 0.16708557307720184, "step": 294 }, { "epoch": 0.0044049903314195265, "grad_norm": 0.82421875, "grad_norm_var": 0.016562652587890626, "learning_rate": 0.0001, "loss": 1.6713, "loss/crossentropy": 2.642008900642395, "loss/fcd": 1.51171875, "loss/idx": 1.5, "loss/logits": 0.15961749851703644, "step": 295 }, { "epoch": 0.004419922502034508, "grad_norm": 0.8203125, "grad_norm_var": 0.016904449462890624, "learning_rate": 0.0001, "loss": 1.6041, "loss/crossentropy": 2.7051438093185425, "loss/fcd": 1.4609375, "loss/idx": 1.5, "loss/logits": 0.143184632062912, "step": 296 }, { "epoch": 0.004434854672649489, "grad_norm": 0.84765625, "grad_norm_var": 0.016817665100097655, "learning_rate": 0.0001, "loss": 1.6298, "loss/crossentropy": 2.6207375526428223, "loss/fcd": 1.4765625, "loss/idx": 1.5, "loss/logits": 0.15321487188339233, "step": 297 }, { "epoch": 0.004449786843264471, "grad_norm": 0.93359375, "grad_norm_var": 0.016840044657389322, "learning_rate": 0.0001, "loss": 1.8072, "loss/crossentropy": 2.586044192314148, "loss/fcd": 1.62109375, "loss/idx": 1.5, "loss/logits": 0.18614596128463745, "step": 298 }, { "epoch": 0.004464719013879453, "grad_norm": 0.8203125, "grad_norm_var": 0.017055193583170574, "learning_rate": 0.0001, "loss": 1.6846, "loss/crossentropy": 2.5060659646987915, "loss/fcd": 1.5234375, "loss/idx": 1.5, "loss/logits": 0.16112150996923447, "step": 299 }, { "epoch": 0.004479651184494434, "grad_norm": 0.76171875, "grad_norm_var": 0.0038022359212239584, "learning_rate": 0.0001, "loss": 1.5685, "loss/crossentropy": 2.5580883026123047, "loss/fcd": 1.421875, "loss/idx": 1.5, "loss/logits": 0.14667311310768127, "step": 300 }, { "epoch": 0.004494583355109416, "grad_norm": 0.83203125, "grad_norm_var": 0.003642781575520833, "learning_rate": 0.0001, "loss": 1.6437, "loss/crossentropy": 2.6943860054016113, "loss/fcd": 1.48828125, "loss/idx": 1.5, "loss/logits": 0.15546557307243347, "step": 301 }, { "epoch": 0.004509515525724397, "grad_norm": 0.8125, "grad_norm_var": 0.0028060277303059897, "learning_rate": 0.0001, "loss": 1.6621, "loss/crossentropy": 2.6080933809280396, "loss/fcd": 1.5078125, "loss/idx": 1.5, "loss/logits": 0.15433663129806519, "step": 302 }, { "epoch": 0.0045244476963393785, "grad_norm": 1.0, "grad_norm_var": 0.004170735677083333, "learning_rate": 0.0001, "loss": 1.7273, "loss/crossentropy": 2.4456557035446167, "loss/fcd": 1.5625, "loss/idx": 1.5, "loss/logits": 0.16477762162685394, "step": 303 }, { "epoch": 0.0045393798669543595, "grad_norm": 1.78125, "grad_norm_var": 0.057342274983723955, "learning_rate": 0.0001, "loss": 1.7906, "loss/crossentropy": 2.5243054628372192, "loss/fcd": 1.6171875, "loss/idx": 2.0, "loss/logits": 0.17336448282003403, "step": 304 }, { "epoch": 0.004554312037569341, "grad_norm": 2.78125, "grad_norm_var": 0.27192529042561847, "learning_rate": 0.0001, "loss": 2.4082, "loss/crossentropy": 2.0944008827209473, "loss/fcd": 2.1328125, "loss/idx": 2.0, "loss/logits": 0.27535852044820786, "step": 305 }, { "epoch": 0.004569244208184322, "grad_norm": 2.234375, "grad_norm_var": 0.35542494455973306, "learning_rate": 0.0001, "loss": 2.2988, "loss/crossentropy": 2.386541962623596, "loss/fcd": 2.046875, "loss/idx": 2.0, "loss/logits": 0.25191500037908554, "step": 306 }, { "epoch": 0.004584176378799304, "grad_norm": 1.7109375, "grad_norm_var": 0.37271525065104166, "learning_rate": 0.0001, "loss": 2.0799, "loss/crossentropy": 2.5651217699050903, "loss/fcd": 1.875, "loss/idx": 2.0, "loss/logits": 0.2048977091908455, "step": 307 }, { "epoch": 0.004599108549414286, "grad_norm": 1.3984375, "grad_norm_var": 0.36888472239176434, "learning_rate": 0.0001, "loss": 1.888, "loss/crossentropy": 2.3940224647521973, "loss/fcd": 1.70703125, "loss/idx": 2.0, "loss/logits": 0.18101809173822403, "step": 308 }, { "epoch": 0.004614040720029267, "grad_norm": 1.4921875, "grad_norm_var": 0.3688674290974935, "learning_rate": 0.0001, "loss": 1.8624, "loss/crossentropy": 2.7549002170562744, "loss/fcd": 1.6875, "loss/idx": 2.0, "loss/logits": 0.17488256096839905, "step": 309 }, { "epoch": 0.004628972890644249, "grad_norm": 1.5078125, "grad_norm_var": 0.3610422134399414, "learning_rate": 0.0001, "loss": 1.9112, "loss/crossentropy": 2.855328917503357, "loss/fcd": 1.734375, "loss/idx": 2.0, "loss/logits": 0.17682038247585297, "step": 310 }, { "epoch": 0.00464390506125923, "grad_norm": 1.28125, "grad_norm_var": 0.3460235595703125, "learning_rate": 0.0001, "loss": 1.8355, "loss/crossentropy": 2.4556884765625, "loss/fcd": 1.67578125, "loss/idx": 2.0, "loss/logits": 0.15974701941013336, "step": 311 }, { "epoch": 0.0046588372318742115, "grad_norm": 1.4296875, "grad_norm_var": 0.32916259765625, "learning_rate": 0.0001, "loss": 2.0998, "loss/crossentropy": 2.13610577583313, "loss/fcd": 1.8984375, "loss/idx": 2.0, "loss/logits": 0.20135962963104248, "step": 312 }, { "epoch": 0.0046737694024891925, "grad_norm": 1.1796875, "grad_norm_var": 0.31374454498291016, "learning_rate": 0.0001, "loss": 1.7831, "loss/crossentropy": 2.529542326927185, "loss/fcd": 1.62890625, "loss/idx": 2.0, "loss/logits": 0.15422120690345764, "step": 313 }, { "epoch": 0.004688701573104174, "grad_norm": 1.3359375, "grad_norm_var": 0.3003265380859375, "learning_rate": 0.0001, "loss": 1.9462, "loss/crossentropy": 2.286463499069214, "loss/fcd": 1.765625, "loss/idx": 2.0, "loss/logits": 0.18060050159692764, "step": 314 }, { "epoch": 0.004703633743719155, "grad_norm": 1.390625, "grad_norm_var": 0.2767677307128906, "learning_rate": 0.0001, "loss": 1.931, "loss/crossentropy": 2.2394256591796875, "loss/fcd": 1.74609375, "loss/idx": 2.0, "loss/logits": 0.1849333867430687, "step": 315 }, { "epoch": 0.004718565914334137, "grad_norm": 1.3671875, "grad_norm_var": 0.24547926584879556, "learning_rate": 0.0001, "loss": 1.9587, "loss/crossentropy": 2.5843063592910767, "loss/fcd": 1.7734375, "loss/idx": 2.0, "loss/logits": 0.18526208400726318, "step": 316 }, { "epoch": 0.004733498084949119, "grad_norm": 1.359375, "grad_norm_var": 0.21793619791666666, "learning_rate": 0.0001, "loss": 1.9264, "loss/crossentropy": 2.453084349632263, "loss/fcd": 1.73046875, "loss/idx": 2.0, "loss/logits": 0.1959688439965248, "step": 317 }, { "epoch": 0.0047484302555641, "grad_norm": 1.2734375, "grad_norm_var": 0.1887224833170573, "learning_rate": 0.0001, "loss": 1.9691, "loss/crossentropy": 2.449620246887207, "loss/fcd": 1.77734375, "loss/idx": 2.0, "loss/logits": 0.19172405451536179, "step": 318 }, { "epoch": 0.004763362426179082, "grad_norm": 1.1796875, "grad_norm_var": 0.17797749837239582, "learning_rate": 0.0001, "loss": 1.8097, "loss/crossentropy": 2.6453970670700073, "loss/fcd": 1.63671875, "loss/idx": 2.0, "loss/logits": 0.17299381643533707, "step": 319 }, { "epoch": 0.004778294596794063, "grad_norm": 1.2578125, "grad_norm_var": 0.17853978474934895, "learning_rate": 0.0001, "loss": 1.9285, "loss/crossentropy": 2.4863957166671753, "loss/fcd": 1.73828125, "loss/idx": 2.0, "loss/logits": 0.19023225456476212, "step": 320 }, { "epoch": 0.0047932267674090446, "grad_norm": 1.0234375, "grad_norm_var": 0.07399800618489584, "learning_rate": 0.0001, "loss": 1.7707, "loss/crossentropy": 2.5647225379943848, "loss/fcd": 1.609375, "loss/idx": 2.0, "loss/logits": 0.16133707761764526, "step": 321 }, { "epoch": 0.0048081589380240255, "grad_norm": 1.1875, "grad_norm_var": 0.026220703125, "learning_rate": 0.0001, "loss": 1.8213, "loss/crossentropy": 2.834408164024353, "loss/fcd": 1.65625, "loss/idx": 2.0, "loss/logits": 0.165000282227993, "step": 322 }, { "epoch": 0.004823091108639007, "grad_norm": 1.171875, "grad_norm_var": 0.017429351806640625, "learning_rate": 0.0001, "loss": 1.8912, "loss/crossentropy": 2.6110740900039673, "loss/fcd": 1.7109375, "loss/idx": 2.0, "loss/logits": 0.18023446202278137, "step": 323 }, { "epoch": 0.004838023279253989, "grad_norm": 1.0078125, "grad_norm_var": 0.021956125895182293, "learning_rate": 0.0001, "loss": 1.7027, "loss/crossentropy": 2.4298505783081055, "loss/fcd": 1.55859375, "loss/idx": 2.0, "loss/logits": 0.1441233903169632, "step": 324 }, { "epoch": 0.00485295544986897, "grad_norm": 1.4375, "grad_norm_var": 0.020580037434895834, "learning_rate": 0.0001, "loss": 1.9852, "loss/crossentropy": 2.5150561332702637, "loss/fcd": 1.73046875, "loss/idx": 2.0, "loss/logits": 0.25472788512706757, "step": 325 }, { "epoch": 0.004867887620483952, "grad_norm": 1.3046875, "grad_norm_var": 0.016837565104166667, "learning_rate": 0.0001, "loss": 1.7973, "loss/crossentropy": 2.7767333984375, "loss/fcd": 1.625, "loss/idx": 2.0, "loss/logits": 0.17234515398740768, "step": 326 }, { "epoch": 0.004882819791098933, "grad_norm": 1.3671875, "grad_norm_var": 0.01752293904622396, "learning_rate": 0.0001, "loss": 2.1812, "loss/crossentropy": 2.416645646095276, "loss/fcd": 1.96875, "loss/idx": 2.0, "loss/logits": 0.21248316764831543, "step": 327 }, { "epoch": 0.004897751961713915, "grad_norm": 1.046875, "grad_norm_var": 0.018382771809895834, "learning_rate": 0.0001, "loss": 1.9089, "loss/crossentropy": 2.5154889822006226, "loss/fcd": 1.71875, "loss/idx": 2.0, "loss/logits": 0.19012955576181412, "step": 328 }, { "epoch": 0.004912684132328896, "grad_norm": 1.1484375, "grad_norm_var": 0.018708292643229166, "learning_rate": 0.0001, "loss": 1.8608, "loss/crossentropy": 2.5697543621063232, "loss/fcd": 1.6796875, "loss/idx": 2.0, "loss/logits": 0.18107804656028748, "step": 329 }, { "epoch": 0.0049276163029438776, "grad_norm": 1.0546875, "grad_norm_var": 0.020099894205729166, "learning_rate": 0.0001, "loss": 1.8524, "loss/crossentropy": 2.3531534671783447, "loss/fcd": 1.6796875, "loss/idx": 2.0, "loss/logits": 0.1726970225572586, "step": 330 }, { "epoch": 0.0049425484735588585, "grad_norm": 1.015625, "grad_norm_var": 0.020539347330729166, "learning_rate": 0.0001, "loss": 1.6318, "loss/crossentropy": 2.558312177658081, "loss/fcd": 1.4921875, "loss/idx": 2.0, "loss/logits": 0.1396355852484703, "step": 331 }, { "epoch": 0.00495748064417384, "grad_norm": 1.0625, "grad_norm_var": 0.019557444254557292, "learning_rate": 0.0001, "loss": 1.8684, "loss/crossentropy": 2.4718196392059326, "loss/fcd": 1.6875, "loss/idx": 2.0, "loss/logits": 0.18088336288928986, "step": 332 }, { "epoch": 0.004972412814788822, "grad_norm": 1.125, "grad_norm_var": 0.017421213785807292, "learning_rate": 0.0001, "loss": 1.7542, "loss/crossentropy": 2.646172881126404, "loss/fcd": 1.6015625, "loss/idx": 2.0, "loss/logits": 0.15264033526182175, "step": 333 }, { "epoch": 0.004987344985403803, "grad_norm": 0.98828125, "grad_norm_var": 0.018437639872233073, "learning_rate": 0.0001, "loss": 1.8037, "loss/crossentropy": 2.658991813659668, "loss/fcd": 1.625, "loss/idx": 2.0, "loss/logits": 0.17869339883327484, "step": 334 }, { "epoch": 0.005002277156018785, "grad_norm": 1.3359375, "grad_norm_var": 0.020609474182128905, "learning_rate": 0.0001, "loss": 2.0561, "loss/crossentropy": 2.62394380569458, "loss/fcd": 1.84765625, "loss/idx": 2.0, "loss/logits": 0.20848941057920456, "step": 335 }, { "epoch": 0.005017209326633766, "grad_norm": 1.09375, "grad_norm_var": 0.020118141174316408, "learning_rate": 0.0001, "loss": 1.7623, "loss/crossentropy": 2.430611491203308, "loss/fcd": 1.59375, "loss/idx": 2.0, "loss/logits": 0.1685967594385147, "step": 336 }, { "epoch": 0.005032141497248748, "grad_norm": 1.375, "grad_norm_var": 0.021994972229003908, "learning_rate": 0.0001, "loss": 1.8267, "loss/crossentropy": 2.6477928161621094, "loss/fcd": 1.65625, "loss/idx": 2.0, "loss/logits": 0.1704726666212082, "step": 337 }, { "epoch": 0.005047073667863729, "grad_norm": 1.1796875, "grad_norm_var": 0.021980730692545573, "learning_rate": 0.0001, "loss": 1.8154, "loss/crossentropy": 2.4817826747894287, "loss/fcd": 1.65234375, "loss/idx": 2.0, "loss/logits": 0.16309074312448502, "step": 338 }, { "epoch": 0.005062005838478711, "grad_norm": 1.4453125, "grad_norm_var": 0.026733843485514323, "learning_rate": 0.0001, "loss": 2.2624, "loss/crossentropy": 2.9061405658721924, "loss/fcd": 2.02734375, "loss/idx": 2.0, "loss/logits": 0.2350568175315857, "step": 339 }, { "epoch": 0.0050769380090936915, "grad_norm": 1.0859375, "grad_norm_var": 0.025251197814941406, "learning_rate": 0.0001, "loss": 1.7576, "loss/crossentropy": 2.5378034114837646, "loss/fcd": 1.5859375, "loss/idx": 2.0, "loss/logits": 0.17163680493831635, "step": 340 }, { "epoch": 0.005091870179708673, "grad_norm": 1.109375, "grad_norm_var": 0.021224403381347658, "learning_rate": 0.0001, "loss": 1.7415, "loss/crossentropy": 2.383753538131714, "loss/fcd": 1.578125, "loss/idx": 2.0, "loss/logits": 0.16342193633317947, "step": 341 }, { "epoch": 0.005106802350323655, "grad_norm": 1.1640625, "grad_norm_var": 0.019956398010253906, "learning_rate": 0.0001, "loss": 1.707, "loss/crossentropy": 2.3804928064346313, "loss/fcd": 1.546875, "loss/idx": 2.0, "loss/logits": 0.1601005494594574, "step": 342 }, { "epoch": 0.005121734520938636, "grad_norm": 1.0546875, "grad_norm_var": 0.017525164286295573, "learning_rate": 0.0001, "loss": 1.7786, "loss/crossentropy": 2.5734684467315674, "loss/fcd": 1.609375, "loss/idx": 2.0, "loss/logits": 0.16918828338384628, "step": 343 }, { "epoch": 0.005136666691553618, "grad_norm": 1.34375, "grad_norm_var": 0.019235674540201822, "learning_rate": 0.0001, "loss": 1.8536, "loss/crossentropy": 2.6887491941452026, "loss/fcd": 1.6640625, "loss/idx": 2.0, "loss/logits": 0.18957675993442535, "step": 344 }, { "epoch": 0.005151598862168599, "grad_norm": 0.99609375, "grad_norm_var": 0.020949045817057293, "learning_rate": 0.0001, "loss": 1.7255, "loss/crossentropy": 2.3476874828338623, "loss/fcd": 1.5859375, "loss/idx": 2.0, "loss/logits": 0.1395951807498932, "step": 345 }, { "epoch": 0.005166531032783581, "grad_norm": 0.9609375, "grad_norm_var": 0.02271296183268229, "learning_rate": 0.0001, "loss": 1.7556, "loss/crossentropy": 2.597803831100464, "loss/fcd": 1.59765625, "loss/idx": 2.0, "loss/logits": 0.15798340737819672, "step": 346 }, { "epoch": 0.005181463203398562, "grad_norm": 1.7421875, "grad_norm_var": 0.043076578776041666, "learning_rate": 0.0001, "loss": 1.8556, "loss/crossentropy": 2.5390493869781494, "loss/fcd": 1.67578125, "loss/idx": 2.0, "loss/logits": 0.17979220300912857, "step": 347 }, { "epoch": 0.005196395374013544, "grad_norm": 1.25, "grad_norm_var": 0.04205118815104167, "learning_rate": 0.0001, "loss": 1.7972, "loss/crossentropy": 2.640982985496521, "loss/fcd": 1.625, "loss/idx": 2.0, "loss/logits": 0.1721990555524826, "step": 348 }, { "epoch": 0.0052113275446285245, "grad_norm": 1.3671875, "grad_norm_var": 0.04319432576497396, "learning_rate": 0.0001, "loss": 1.8083, "loss/crossentropy": 2.61712646484375, "loss/fcd": 1.64453125, "loss/idx": 2.0, "loss/logits": 0.16379254311323166, "step": 349 }, { "epoch": 0.005226259715243506, "grad_norm": 1.1640625, "grad_norm_var": 0.03973534901936849, "learning_rate": 0.0001, "loss": 1.7109, "loss/crossentropy": 2.527679681777954, "loss/fcd": 1.55859375, "loss/idx": 2.0, "loss/logits": 0.1522795334458351, "step": 350 }, { "epoch": 0.005241191885858488, "grad_norm": 1.140625, "grad_norm_var": 0.039341163635253903, "learning_rate": 0.0001, "loss": 1.7953, "loss/crossentropy": 2.591903328895569, "loss/fcd": 1.625, "loss/idx": 2.0, "loss/logits": 0.17032313346862793, "step": 351 }, { "epoch": 0.005256124056473469, "grad_norm": 1.1953125, "grad_norm_var": 0.03831628163655599, "learning_rate": 0.0001, "loss": 1.7707, "loss/crossentropy": 2.668965458869934, "loss/fcd": 1.609375, "loss/idx": 2.0, "loss/logits": 0.16133900731801987, "step": 352 }, { "epoch": 0.005271056227088451, "grad_norm": 1.0390625, "grad_norm_var": 0.038578732808430986, "learning_rate": 0.0001, "loss": 1.7464, "loss/crossentropy": 2.514343738555908, "loss/fcd": 1.58984375, "loss/idx": 2.0, "loss/logits": 0.1565106362104416, "step": 353 }, { "epoch": 0.005285988397703432, "grad_norm": 1.1171875, "grad_norm_var": 0.03901208241780599, "learning_rate": 0.0001, "loss": 1.8161, "loss/crossentropy": 2.726048231124878, "loss/fcd": 1.6171875, "loss/idx": 2.0, "loss/logits": 0.1989368051290512, "step": 354 }, { "epoch": 0.005300920568318414, "grad_norm": 1.125, "grad_norm_var": 0.03488305409749349, "learning_rate": 0.0001, "loss": 1.9807, "loss/crossentropy": 2.365426182746887, "loss/fcd": 1.78515625, "loss/idx": 2.0, "loss/logits": 0.1955546736717224, "step": 355 }, { "epoch": 0.005315852738933395, "grad_norm": 1.09375, "grad_norm_var": 0.03479048411051432, "learning_rate": 0.0001, "loss": 1.725, "loss/crossentropy": 2.2926045656204224, "loss/fcd": 1.578125, "loss/idx": 2.0, "loss/logits": 0.14692262932658195, "step": 356 }, { "epoch": 0.005330784909548377, "grad_norm": 0.98828125, "grad_norm_var": 0.03683039347330729, "learning_rate": 0.0001, "loss": 1.796, "loss/crossentropy": 2.489909529685974, "loss/fcd": 1.62109375, "loss/idx": 2.0, "loss/logits": 0.17487448453903198, "step": 357 }, { "epoch": 0.0053457170801633576, "grad_norm": 0.62109375, "grad_norm_var": 0.05578657786051432, "learning_rate": 0.0001, "loss": 1.7193, "loss/crossentropy": 2.467299699783325, "loss/fcd": 1.52734375, "loss/idx": 2.5, "loss/logits": 0.19200193881988525, "step": 358 }, { "epoch": 0.005360649250778339, "grad_norm": 0.6328125, "grad_norm_var": 0.07156569163004557, "learning_rate": 0.0001, "loss": 1.586, "loss/crossentropy": 2.6709818840026855, "loss/fcd": 1.41796875, "loss/idx": 2.5, "loss/logits": 0.16798634827136993, "step": 359 }, { "epoch": 0.005375581421393321, "grad_norm": 0.63671875, "grad_norm_var": 0.08087539672851562, "learning_rate": 0.0001, "loss": 1.6455, "loss/crossentropy": 2.360570549964905, "loss/fcd": 1.45703125, "loss/idx": 2.5, "loss/logits": 0.1884693130850792, "step": 360 }, { "epoch": 0.005390513592008302, "grad_norm": 0.69140625, "grad_norm_var": 0.0895538330078125, "learning_rate": 0.0001, "loss": 1.6262, "loss/crossentropy": 2.6428942680358887, "loss/fcd": 1.453125, "loss/idx": 2.5, "loss/logits": 0.173103965818882, "step": 361 }, { "epoch": 0.005405445762623284, "grad_norm": 0.59375, "grad_norm_var": 0.1022356669108073, "learning_rate": 0.0001, "loss": 1.4134, "loss/crossentropy": 2.5077950954437256, "loss/fcd": 1.28125, "loss/idx": 2.5, "loss/logits": 0.13219071924686432, "step": 362 }, { "epoch": 0.005420377933238265, "grad_norm": 0.625, "grad_norm_var": 0.07339680989583333, "learning_rate": 0.0001, "loss": 1.543, "loss/crossentropy": 2.4522061347961426, "loss/fcd": 1.390625, "loss/idx": 2.5, "loss/logits": 0.15238897874951363, "step": 363 }, { "epoch": 0.005435310103853247, "grad_norm": 0.7578125, "grad_norm_var": 0.06918309529622396, "learning_rate": 0.0001, "loss": 1.7256, "loss/crossentropy": 2.442309260368347, "loss/fcd": 1.51953125, "loss/idx": 2.5, "loss/logits": 0.20608609169721603, "step": 364 }, { "epoch": 0.005450242274468228, "grad_norm": 0.5859375, "grad_norm_var": 0.061197662353515626, "learning_rate": 0.0001, "loss": 1.5033, "loss/crossentropy": 2.4856804609298706, "loss/fcd": 1.35546875, "loss/idx": 2.5, "loss/logits": 0.14786505699157715, "step": 365 }, { "epoch": 0.00546517444508321, "grad_norm": 0.59765625, "grad_norm_var": 0.05945529937744141, "learning_rate": 0.0001, "loss": 1.5867, "loss/crossentropy": 2.6083357334136963, "loss/fcd": 1.4296875, "loss/idx": 2.5, "loss/logits": 0.1570557877421379, "step": 366 }, { "epoch": 0.0054801066156981906, "grad_norm": 0.6953125, "grad_norm_var": 0.054004859924316403, "learning_rate": 0.0001, "loss": 1.6414, "loss/crossentropy": 2.6212077140808105, "loss/fcd": 1.45703125, "loss/idx": 2.5, "loss/logits": 0.18436457216739655, "step": 367 }, { "epoch": 0.005495038786313172, "grad_norm": 0.63671875, "grad_norm_var": 0.044976806640625, "learning_rate": 0.0001, "loss": 1.6194, "loss/crossentropy": 2.5251601934432983, "loss/fcd": 1.4453125, "loss/idx": 2.5, "loss/logits": 0.17409738898277283, "step": 368 }, { "epoch": 0.005509970956928154, "grad_norm": 0.6640625, "grad_norm_var": 0.040679931640625, "learning_rate": 0.0001, "loss": 1.609, "loss/crossentropy": 2.6006916761398315, "loss/fcd": 1.4296875, "loss/idx": 2.5, "loss/logits": 0.17928281426429749, "step": 369 }, { "epoch": 0.005524903127543135, "grad_norm": 0.6328125, "grad_norm_var": 0.0318817138671875, "learning_rate": 0.0001, "loss": 1.715, "loss/crossentropy": 2.6712244749069214, "loss/fcd": 1.5234375, "loss/idx": 2.5, "loss/logits": 0.19155558198690414, "step": 370 }, { "epoch": 0.005539835298158117, "grad_norm": 0.8125, "grad_norm_var": 0.0212615966796875, "learning_rate": 0.0001, "loss": 1.6621, "loss/crossentropy": 2.5613213777542114, "loss/fcd": 1.5, "loss/idx": 2.5, "loss/logits": 0.16209547966718674, "step": 371 }, { "epoch": 0.005554767468773098, "grad_norm": 0.65625, "grad_norm_var": 0.0104949951171875, "learning_rate": 0.0001, "loss": 1.3512, "loss/crossentropy": 2.641627550125122, "loss/fcd": 1.23828125, "loss/idx": 2.5, "loss/logits": 0.11289479583501816, "step": 372 }, { "epoch": 0.00556969963938808, "grad_norm": 0.6171875, "grad_norm_var": 0.0036879857381184895, "learning_rate": 0.0001, "loss": 1.5787, "loss/crossentropy": 2.66814649105072, "loss/fcd": 1.421875, "loss/idx": 2.5, "loss/logits": 0.15683035552501678, "step": 373 }, { "epoch": 0.005584631810003061, "grad_norm": 0.6484375, "grad_norm_var": 0.0036163330078125, "learning_rate": 0.0001, "loss": 1.6049, "loss/crossentropy": 2.6782963275909424, "loss/fcd": 1.4453125, "loss/idx": 2.5, "loss/logits": 0.1595475971698761, "step": 374 }, { "epoch": 0.005599563980618043, "grad_norm": 0.66015625, "grad_norm_var": 0.0035811742146809895, "learning_rate": 0.0001, "loss": 1.7587, "loss/crossentropy": 2.5838359594345093, "loss/fcd": 1.546875, "loss/idx": 2.5, "loss/logits": 0.2117815613746643, "step": 375 }, { "epoch": 0.005614496151233024, "grad_norm": 0.70703125, "grad_norm_var": 0.0037001927693684896, "learning_rate": 0.0001, "loss": 1.6537, "loss/crossentropy": 2.2126421332359314, "loss/fcd": 1.48828125, "loss/idx": 2.5, "loss/logits": 0.16539722681045532, "step": 376 }, { "epoch": 0.005629428321848005, "grad_norm": 0.6484375, "grad_norm_var": 0.0036435445149739583, "learning_rate": 0.0001, "loss": 1.6255, "loss/crossentropy": 2.60499370098114, "loss/fcd": 1.45703125, "loss/idx": 2.5, "loss/logits": 0.1684800684452057, "step": 377 }, { "epoch": 0.005644360492462987, "grad_norm": 0.6953125, "grad_norm_var": 0.0034088134765625, "learning_rate": 0.0001, "loss": 1.5125, "loss/crossentropy": 2.749386191368103, "loss/fcd": 1.37109375, "loss/idx": 2.5, "loss/logits": 0.14144720882177353, "step": 378 }, { "epoch": 0.005659292663077968, "grad_norm": 0.671875, "grad_norm_var": 0.0032958984375, "learning_rate": 0.0001, "loss": 1.5179, "loss/crossentropy": 2.5258721113204956, "loss/fcd": 1.375, "loss/idx": 2.5, "loss/logits": 0.14287371188402176, "step": 379 }, { "epoch": 0.00567422483369295, "grad_norm": 0.828125, "grad_norm_var": 0.004447174072265625, "learning_rate": 0.0001, "loss": 1.5972, "loss/crossentropy": 2.4670597314834595, "loss/fcd": 1.4296875, "loss/idx": 2.5, "loss/logits": 0.1674765683710575, "step": 380 }, { "epoch": 0.005689157004307931, "grad_norm": 0.75, "grad_norm_var": 0.0042388916015625, "learning_rate": 0.0001, "loss": 1.7309, "loss/crossentropy": 2.41486656665802, "loss/fcd": 1.5625, "loss/idx": 2.5, "loss/logits": 0.1684126779437065, "step": 381 }, { "epoch": 0.005704089174922913, "grad_norm": 1.1015625, "grad_norm_var": 0.014400672912597657, "learning_rate": 0.0001, "loss": 1.8496, "loss/crossentropy": 2.5699658393859863, "loss/fcd": 1.62890625, "loss/idx": 2.5, "loss/logits": 0.22069481760263443, "step": 382 }, { "epoch": 0.005719021345537894, "grad_norm": 0.61328125, "grad_norm_var": 0.01502685546875, "learning_rate": 0.0001, "loss": 1.4251, "loss/crossentropy": 2.6866044998168945, "loss/fcd": 1.2890625, "loss/idx": 2.5, "loss/logits": 0.1360424980521202, "step": 383 }, { "epoch": 0.005733953516152876, "grad_norm": 0.671875, "grad_norm_var": 0.014765357971191407, "learning_rate": 0.0001, "loss": 1.5608, "loss/crossentropy": 2.7703367471694946, "loss/fcd": 1.40625, "loss/idx": 2.5, "loss/logits": 0.15454240143299103, "step": 384 }, { "epoch": 0.005748885686767857, "grad_norm": 0.71484375, "grad_norm_var": 0.014607493082682292, "learning_rate": 0.0001, "loss": 1.6469, "loss/crossentropy": 2.5732953548431396, "loss/fcd": 1.4765625, "loss/idx": 2.5, "loss/logits": 0.17037127912044525, "step": 385 }, { "epoch": 0.005763817857382838, "grad_norm": 0.69140625, "grad_norm_var": 0.014185015360514324, "learning_rate": 0.0001, "loss": 1.5475, "loss/crossentropy": 2.9296597242355347, "loss/fcd": 1.3984375, "loss/idx": 2.5, "loss/logits": 0.14904215186834335, "step": 386 }, { "epoch": 0.00577875002799782, "grad_norm": 0.6015625, "grad_norm_var": 0.014308611551920572, "learning_rate": 0.0001, "loss": 1.5924, "loss/crossentropy": 2.6140079498291016, "loss/fcd": 1.42578125, "loss/idx": 2.5, "loss/logits": 0.16666851192712784, "step": 387 }, { "epoch": 0.005793682198612801, "grad_norm": 0.9296875, "grad_norm_var": 0.01721032460530599, "learning_rate": 0.0001, "loss": 1.9019, "loss/crossentropy": 2.599937319755554, "loss/fcd": 1.6640625, "loss/idx": 2.5, "loss/logits": 0.2378673255443573, "step": 388 }, { "epoch": 0.005808614369227783, "grad_norm": 0.625, "grad_norm_var": 0.01710503896077474, "learning_rate": 0.0001, "loss": 1.5824, "loss/crossentropy": 2.548129081726074, "loss/fcd": 1.42578125, "loss/idx": 2.5, "loss/logits": 0.1565687358379364, "step": 389 }, { "epoch": 0.005823546539842764, "grad_norm": 0.59375, "grad_norm_var": 0.01783135732014974, "learning_rate": 0.0001, "loss": 1.6191, "loss/crossentropy": 2.7168357372283936, "loss/fcd": 1.44140625, "loss/idx": 2.5, "loss/logits": 0.17769953608512878, "step": 390 }, { "epoch": 0.005838478710457746, "grad_norm": 0.6328125, "grad_norm_var": 0.018092600504557292, "learning_rate": 0.0001, "loss": 1.6704, "loss/crossentropy": 2.4698110818862915, "loss/fcd": 1.48046875, "loss/idx": 2.5, "loss/logits": 0.189944326877594, "step": 391 }, { "epoch": 0.005853410881072727, "grad_norm": 0.65625, "grad_norm_var": 0.018323198954264323, "learning_rate": 0.0001, "loss": 1.5337, "loss/crossentropy": 2.594900608062744, "loss/fcd": 1.3828125, "loss/idx": 2.5, "loss/logits": 0.15086808055639267, "step": 392 }, { "epoch": 0.005868343051687709, "grad_norm": 0.6484375, "grad_norm_var": 0.018323198954264323, "learning_rate": 0.0001, "loss": 1.6741, "loss/crossentropy": 2.6552109718322754, "loss/fcd": 1.49609375, "loss/idx": 2.5, "loss/logits": 0.17803405970335007, "step": 393 }, { "epoch": 0.0058832752223026905, "grad_norm": 0.7578125, "grad_norm_var": 0.018410682678222656, "learning_rate": 0.0001, "loss": 1.7245, "loss/crossentropy": 2.6911847591400146, "loss/fcd": 1.53515625, "loss/idx": 2.5, "loss/logits": 0.18936707079410553, "step": 394 }, { "epoch": 0.005898207392917671, "grad_norm": 0.6328125, "grad_norm_var": 0.018746376037597656, "learning_rate": 0.0001, "loss": 1.6557, "loss/crossentropy": 2.44334614276886, "loss/fcd": 1.49609375, "loss/idx": 2.5, "loss/logits": 0.15957393497228622, "step": 395 }, { "epoch": 0.005913139563532653, "grad_norm": 0.58203125, "grad_norm_var": 0.0188385009765625, "learning_rate": 0.0001, "loss": 1.5261, "loss/crossentropy": 2.562255382537842, "loss/fcd": 1.37890625, "loss/idx": 2.5, "loss/logits": 0.14717654883861542, "step": 396 }, { "epoch": 0.005928071734147634, "grad_norm": 0.76953125, "grad_norm_var": 0.018992042541503905, "learning_rate": 0.0001, "loss": 1.676, "loss/crossentropy": 2.5732717514038086, "loss/fcd": 1.49609375, "loss/idx": 2.5, "loss/logits": 0.17988938093185425, "step": 397 }, { "epoch": 0.005943003904762616, "grad_norm": 1.6171875, "grad_norm_var": 0.06311893463134766, "learning_rate": 0.0001, "loss": 1.7434, "loss/crossentropy": 2.4858436584472656, "loss/fcd": 1.57421875, "loss/idx": 3.0, "loss/logits": 0.16918716579675674, "step": 398 }, { "epoch": 0.005957936075377597, "grad_norm": 2.65625, "grad_norm_var": 0.2911905924479167, "learning_rate": 0.0001, "loss": 2.05, "loss/crossentropy": 2.784856915473938, "loss/fcd": 1.83984375, "loss/idx": 3.0, "loss/logits": 0.21015368402004242, "step": 399 }, { "epoch": 0.005972868245992579, "grad_norm": 2.546875, "grad_norm_var": 0.4635538736979167, "learning_rate": 0.0001, "loss": 2.0473, "loss/crossentropy": 2.734723210334778, "loss/fcd": 1.84765625, "loss/idx": 3.0, "loss/logits": 0.19964434951543808, "step": 400 }, { "epoch": 0.00598780041660756, "grad_norm": 1.78125, "grad_norm_var": 0.4971394220987956, "learning_rate": 0.0001, "loss": 2.0588, "loss/crossentropy": 2.341898798942566, "loss/fcd": 1.86328125, "loss/idx": 3.0, "loss/logits": 0.1955610066652298, "step": 401 }, { "epoch": 0.006002732587222542, "grad_norm": 1.8046875, "grad_norm_var": 0.5220904032389323, "learning_rate": 0.0001, "loss": 2.0501, "loss/crossentropy": 2.6248419284820557, "loss/fcd": 1.828125, "loss/idx": 3.0, "loss/logits": 0.22197691351175308, "step": 402 }, { "epoch": 0.0060176647578375235, "grad_norm": 1.46875, "grad_norm_var": 0.5097544352213542, "learning_rate": 0.0001, "loss": 2.0025, "loss/crossentropy": 2.783551812171936, "loss/fcd": 1.79296875, "loss/idx": 3.0, "loss/logits": 0.2095436304807663, "step": 403 }, { "epoch": 0.006032596928452504, "grad_norm": 1.546875, "grad_norm_var": 0.513873036702474, "learning_rate": 0.0001, "loss": 1.9867, "loss/crossentropy": 2.6072347164154053, "loss/fcd": 1.80078125, "loss/idx": 3.0, "loss/logits": 0.1859557330608368, "step": 404 }, { "epoch": 0.006047529099067486, "grad_norm": 1.3046875, "grad_norm_var": 0.48995564778645834, "learning_rate": 0.0001, "loss": 1.9417, "loss/crossentropy": 2.627634286880493, "loss/fcd": 1.7578125, "loss/idx": 3.0, "loss/logits": 0.18390918523073196, "step": 405 }, { "epoch": 0.006062461269682467, "grad_norm": 2.5625, "grad_norm_var": 0.5599385579427083, "learning_rate": 0.0001, "loss": 2.6781, "loss/crossentropy": 2.818581461906433, "loss/fcd": 2.34765625, "loss/idx": 3.0, "loss/logits": 0.3304808735847473, "step": 406 }, { "epoch": 0.006077393440297449, "grad_norm": 1.5234375, "grad_norm_var": 0.5216115315755209, "learning_rate": 0.0001, "loss": 2.0978, "loss/crossentropy": 2.4685953855514526, "loss/fcd": 1.86328125, "loss/idx": 3.0, "loss/logits": 0.23454776406288147, "step": 407 }, { "epoch": 0.00609232561091243, "grad_norm": 1.3671875, "grad_norm_var": 0.4799781799316406, "learning_rate": 0.0001, "loss": 1.9471, "loss/crossentropy": 2.551527738571167, "loss/fcd": 1.765625, "loss/idx": 3.0, "loss/logits": 0.18149860948324203, "step": 408 }, { "epoch": 0.006107257781527412, "grad_norm": 1.4140625, "grad_norm_var": 0.43242568969726564, "learning_rate": 0.0001, "loss": 1.9248, "loss/crossentropy": 2.543463706970215, "loss/fcd": 1.75390625, "loss/idx": 3.0, "loss/logits": 0.17087971419095993, "step": 409 }, { "epoch": 0.006122189952142393, "grad_norm": 1.3046875, "grad_norm_var": 0.39546890258789064, "learning_rate": 0.0001, "loss": 1.9248, "loss/crossentropy": 2.753381848335266, "loss/fcd": 1.72265625, "loss/idx": 3.0, "loss/logits": 0.20210126042366028, "step": 410 }, { "epoch": 0.006137122122757375, "grad_norm": 1.2734375, "grad_norm_var": 0.342333730061849, "learning_rate": 0.0001, "loss": 1.9107, "loss/crossentropy": 2.6746675968170166, "loss/fcd": 1.71484375, "loss/idx": 3.0, "loss/logits": 0.19586054980754852, "step": 411 }, { "epoch": 0.0061520542933723565, "grad_norm": 1.0859375, "grad_norm_var": 0.29013055165608725, "learning_rate": 0.0001, "loss": 1.7843, "loss/crossentropy": 2.6001042127609253, "loss/fcd": 1.6171875, "loss/idx": 3.0, "loss/logits": 0.16708572953939438, "step": 412 }, { "epoch": 0.0061669864639873374, "grad_norm": 0.99609375, "grad_norm_var": 0.2674448013305664, "learning_rate": 0.0001, "loss": 1.6781, "loss/crossentropy": 2.6679431200027466, "loss/fcd": 1.5234375, "loss/idx": 3.0, "loss/logits": 0.15461371839046478, "step": 413 }, { "epoch": 0.006181918634602319, "grad_norm": 1.6015625, "grad_norm_var": 0.267509396870931, "learning_rate": 0.0001, "loss": 2.2249, "loss/crossentropy": 2.26759135723114, "loss/fcd": 1.953125, "loss/idx": 3.0, "loss/logits": 0.27176591753959656, "step": 414 }, { "epoch": 0.0061968508052173, "grad_norm": 1.1015625, "grad_norm_var": 0.2078927993774414, "learning_rate": 0.0001, "loss": 1.8247, "loss/crossentropy": 2.666002035140991, "loss/fcd": 1.640625, "loss/idx": 3.0, "loss/logits": 0.1840425804257393, "step": 415 }, { "epoch": 0.006211782975832282, "grad_norm": 1.078125, "grad_norm_var": 0.1460733413696289, "learning_rate": 0.0001, "loss": 1.7812, "loss/crossentropy": 2.704661011695862, "loss/fcd": 1.6015625, "loss/idx": 3.0, "loss/logits": 0.17964735627174377, "step": 416 }, { "epoch": 0.006226715146447263, "grad_norm": 1.1328125, "grad_norm_var": 0.1437936782836914, "learning_rate": 0.0001, "loss": 1.7362, "loss/crossentropy": 2.7326667308807373, "loss/fcd": 1.57421875, "loss/idx": 3.0, "loss/logits": 0.16196689009666443, "step": 417 }, { "epoch": 0.006241647317062245, "grad_norm": 1.09375, "grad_norm_var": 0.13800805409749348, "learning_rate": 0.0001, "loss": 1.7713, "loss/crossentropy": 2.713056445121765, "loss/fcd": 1.59375, "loss/idx": 3.0, "loss/logits": 0.1775575578212738, "step": 418 }, { "epoch": 0.006256579487677226, "grad_norm": 0.94921875, "grad_norm_var": 0.14775772094726564, "learning_rate": 0.0001, "loss": 1.6976, "loss/crossentropy": 2.5395586490631104, "loss/fcd": 1.546875, "loss/idx": 3.0, "loss/logits": 0.1507459655404091, "step": 419 }, { "epoch": 0.006271511658292208, "grad_norm": 1.015625, "grad_norm_var": 0.15028254191080728, "learning_rate": 0.0001, "loss": 1.9222, "loss/crossentropy": 2.7291557788848877, "loss/fcd": 1.7109375, "loss/idx": 3.0, "loss/logits": 0.21127325296401978, "step": 420 }, { "epoch": 0.0062864438289071895, "grad_norm": 1.203125, "grad_norm_var": 0.15086771647135416, "learning_rate": 0.0001, "loss": 1.8439, "loss/crossentropy": 2.6306716203689575, "loss/fcd": 1.66015625, "loss/idx": 3.0, "loss/logits": 0.18376273661851883, "step": 421 }, { "epoch": 0.0063013759995221704, "grad_norm": 1.0078125, "grad_norm_var": 0.038972727457682294, "learning_rate": 0.0001, "loss": 1.7753, "loss/crossentropy": 2.4223052263259888, "loss/fcd": 1.59375, "loss/idx": 3.0, "loss/logits": 0.1815304309129715, "step": 422 }, { "epoch": 0.006316308170137152, "grad_norm": 1.2265625, "grad_norm_var": 0.03155085245768229, "learning_rate": 0.0001, "loss": 2.0225, "loss/crossentropy": 2.6342397928237915, "loss/fcd": 1.7890625, "loss/idx": 3.0, "loss/logits": 0.23343171179294586, "step": 423 }, { "epoch": 0.006331240340752133, "grad_norm": 1.0078125, "grad_norm_var": 0.03056818644205729, "learning_rate": 0.0001, "loss": 1.7615, "loss/crossentropy": 2.7350512742996216, "loss/fcd": 1.59375, "loss/idx": 3.0, "loss/logits": 0.16770032793283463, "step": 424 }, { "epoch": 0.006346172511367115, "grad_norm": 1.4453125, "grad_norm_var": 0.031705474853515624, "learning_rate": 0.0001, "loss": 1.7408, "loss/crossentropy": 2.6920334100723267, "loss/fcd": 1.578125, "loss/idx": 3.0, "loss/logits": 0.1627093330025673, "step": 425 }, { "epoch": 0.006361104681982096, "grad_norm": 0.91796875, "grad_norm_var": 0.033474159240722653, "learning_rate": 0.0001, "loss": 1.795, "loss/crossentropy": 2.5012316703796387, "loss/fcd": 1.625, "loss/idx": 3.0, "loss/logits": 0.16997068375349045, "step": 426 }, { "epoch": 0.006376036852597078, "grad_norm": 1.1796875, "grad_norm_var": 0.032274818420410155, "learning_rate": 0.0001, "loss": 1.5979, "loss/crossentropy": 2.5879992246627808, "loss/fcd": 1.4609375, "loss/idx": 3.0, "loss/logits": 0.13699475675821304, "step": 427 }, { "epoch": 0.006390969023212059, "grad_norm": 1.375, "grad_norm_var": 0.03588809967041016, "learning_rate": 0.0001, "loss": 1.8766, "loss/crossentropy": 2.762297749519348, "loss/fcd": 1.6875, "loss/idx": 3.0, "loss/logits": 0.18910983949899673, "step": 428 }, { "epoch": 0.006405901193827041, "grad_norm": 0.98046875, "grad_norm_var": 0.03621514638264974, "learning_rate": 0.0001, "loss": 1.7401, "loss/crossentropy": 2.8022525310516357, "loss/fcd": 1.56640625, "loss/idx": 3.0, "loss/logits": 0.17373749613761902, "step": 429 }, { "epoch": 0.0064208333644420225, "grad_norm": 1.125, "grad_norm_var": 0.021384620666503908, "learning_rate": 0.0001, "loss": 2.0262, "loss/crossentropy": 2.3079699277877808, "loss/fcd": 1.828125, "loss/idx": 3.0, "loss/logits": 0.19811610877513885, "step": 430 }, { "epoch": 0.0064357655350570035, "grad_norm": 1.0703125, "grad_norm_var": 0.021501604715983072, "learning_rate": 0.0001, "loss": 1.8896, "loss/crossentropy": 2.638156533241272, "loss/fcd": 1.6796875, "loss/idx": 3.0, "loss/logits": 0.20995599031448364, "step": 431 }, { "epoch": 0.006450697705671985, "grad_norm": 1.0390625, "grad_norm_var": 0.02177880605061849, "learning_rate": 0.0001, "loss": 1.7512, "loss/crossentropy": 2.2521796226501465, "loss/fcd": 1.5859375, "loss/idx": 3.0, "loss/logits": 0.16525574773550034, "step": 432 }, { "epoch": 0.006465629876286966, "grad_norm": 1.0234375, "grad_norm_var": 0.022202491760253906, "learning_rate": 0.0001, "loss": 1.7417, "loss/crossentropy": 2.7757058143615723, "loss/fcd": 1.5703125, "loss/idx": 3.0, "loss/logits": 0.1713484227657318, "step": 433 }, { "epoch": 0.006480562046901948, "grad_norm": 1.3203125, "grad_norm_var": 0.02510827382405599, "learning_rate": 0.0001, "loss": 1.7516, "loss/crossentropy": 2.52429461479187, "loss/fcd": 1.58984375, "loss/idx": 3.0, "loss/logits": 0.16173189133405685, "step": 434 }, { "epoch": 0.006495494217516929, "grad_norm": 0.88671875, "grad_norm_var": 0.026758257548014322, "learning_rate": 0.0001, "loss": 1.778, "loss/crossentropy": 2.576547145843506, "loss/fcd": 1.5859375, "loss/idx": 3.0, "loss/logits": 0.19203433394432068, "step": 435 }, { "epoch": 0.006510426388131911, "grad_norm": 0.9296875, "grad_norm_var": 0.028347206115722657, "learning_rate": 0.0001, "loss": 1.8188, "loss/crossentropy": 2.698705315589905, "loss/fcd": 1.63671875, "loss/idx": 3.0, "loss/logits": 0.18205714225769043, "step": 436 }, { "epoch": 0.006525358558746892, "grad_norm": 1.03125, "grad_norm_var": 0.028028297424316406, "learning_rate": 0.0001, "loss": 1.6624, "loss/crossentropy": 2.384024500846863, "loss/fcd": 1.5, "loss/idx": 3.0, "loss/logits": 0.1624050736427307, "step": 437 }, { "epoch": 0.006540290729361874, "grad_norm": 0.85546875, "grad_norm_var": 0.031308746337890624, "learning_rate": 0.0001, "loss": 1.7476, "loss/crossentropy": 2.4632567167282104, "loss/fcd": 1.5859375, "loss/idx": 3.0, "loss/logits": 0.16169880330562592, "step": 438 }, { "epoch": 0.0065552228999768555, "grad_norm": 0.98828125, "grad_norm_var": 0.03046716054280599, "learning_rate": 0.0001, "loss": 1.7397, "loss/crossentropy": 2.397798180580139, "loss/fcd": 1.58984375, "loss/idx": 3.0, "loss/logits": 0.1498069018125534, "step": 439 }, { "epoch": 0.0065701550705918365, "grad_norm": 1.140625, "grad_norm_var": 0.030406634012858074, "learning_rate": 0.0001, "loss": 2.0347, "loss/crossentropy": 2.70195472240448, "loss/fcd": 1.7890625, "loss/idx": 3.0, "loss/logits": 0.2456425279378891, "step": 440 }, { "epoch": 0.006585087241206818, "grad_norm": 0.8671875, "grad_norm_var": 0.02327416737874349, "learning_rate": 0.0001, "loss": 1.7978, "loss/crossentropy": 2.505584955215454, "loss/fcd": 1.6171875, "loss/idx": 3.0, "loss/logits": 0.1805657595396042, "step": 441 }, { "epoch": 0.006600019411821799, "grad_norm": 0.9140625, "grad_norm_var": 0.023341623942057292, "learning_rate": 0.0001, "loss": 1.7628, "loss/crossentropy": 2.408290147781372, "loss/fcd": 1.59375, "loss/idx": 3.0, "loss/logits": 0.16904298961162567, "step": 442 }, { "epoch": 0.006614951582436781, "grad_norm": 0.9140625, "grad_norm_var": 0.022995758056640624, "learning_rate": 0.0001, "loss": 1.8013, "loss/crossentropy": 2.4535306692123413, "loss/fcd": 1.62109375, "loss/idx": 3.0, "loss/logits": 0.1801793947815895, "step": 443 }, { "epoch": 0.006629883753051762, "grad_norm": 0.93359375, "grad_norm_var": 0.014798418680826823, "learning_rate": 0.0001, "loss": 1.7924, "loss/crossentropy": 2.796666383743286, "loss/fcd": 1.60546875, "loss/idx": 3.0, "loss/logits": 0.18690849840641022, "step": 444 }, { "epoch": 0.006644815923666744, "grad_norm": 1.5859375, "grad_norm_var": 0.03603515625, "learning_rate": 0.0001, "loss": 1.8639, "loss/crossentropy": 2.8323148488998413, "loss/fcd": 1.671875, "loss/idx": 3.0, "loss/logits": 0.19198895245790482, "step": 445 }, { "epoch": 0.006659748094281725, "grad_norm": 1.125, "grad_norm_var": 0.03603515625, "learning_rate": 0.0001, "loss": 1.701, "loss/crossentropy": 2.536306142807007, "loss/fcd": 1.54296875, "loss/idx": 3.0, "loss/logits": 0.1580105796456337, "step": 446 }, { "epoch": 0.006674680264896707, "grad_norm": 0.859375, "grad_norm_var": 0.037937164306640625, "learning_rate": 0.0001, "loss": 1.6491, "loss/crossentropy": 2.4709479808807373, "loss/fcd": 1.4921875, "loss/idx": 3.0, "loss/logits": 0.15694593638181686, "step": 447 }, { "epoch": 0.0066896124355116885, "grad_norm": 1.03125, "grad_norm_var": 0.03792724609375, "learning_rate": 0.0001, "loss": 1.9007, "loss/crossentropy": 2.7610350847244263, "loss/fcd": 1.68359375, "loss/idx": 3.0, "loss/logits": 0.21714556217193604, "step": 448 }, { "epoch": 0.0067045446061266695, "grad_norm": 0.91796875, "grad_norm_var": 0.03864994049072266, "learning_rate": 0.0001, "loss": 1.6404, "loss/crossentropy": 2.662341594696045, "loss/fcd": 1.484375, "loss/idx": 3.0, "loss/logits": 0.15599173307418823, "step": 449 }, { "epoch": 0.006719476776741651, "grad_norm": 1.0703125, "grad_norm_var": 0.032505734761555986, "learning_rate": 0.0001, "loss": 1.6828, "loss/crossentropy": 2.77057945728302, "loss/fcd": 1.515625, "loss/idx": 3.0, "loss/logits": 0.16715113073587418, "step": 450 }, { "epoch": 0.006734408947356632, "grad_norm": 1.4921875, "grad_norm_var": 0.04601643880208333, "learning_rate": 0.0001, "loss": 1.7714, "loss/crossentropy": 2.659293055534363, "loss/fcd": 1.59375, "loss/idx": 3.0, "loss/logits": 0.17766769975423813, "step": 451 }, { "epoch": 0.006749341117971614, "grad_norm": 0.875, "grad_norm_var": 0.04701512654622396, "learning_rate": 0.0001, "loss": 1.602, "loss/crossentropy": 2.4067925214767456, "loss/fcd": 1.453125, "loss/idx": 3.0, "loss/logits": 0.14886727929115295, "step": 452 }, { "epoch": 0.006764273288586595, "grad_norm": 1.0703125, "grad_norm_var": 0.04707743326822917, "learning_rate": 0.0001, "loss": 1.8666, "loss/crossentropy": 2.4434362649917603, "loss/fcd": 1.6875, "loss/idx": 3.0, "loss/logits": 0.179054394364357, "step": 453 }, { "epoch": 0.006779205459201577, "grad_norm": 0.8359375, "grad_norm_var": 0.047581926981608076, "learning_rate": 0.0001, "loss": 1.6547, "loss/crossentropy": 2.508568048477173, "loss/fcd": 1.49609375, "loss/idx": 3.0, "loss/logits": 0.15855654329061508, "step": 454 }, { "epoch": 0.006794137629816558, "grad_norm": 0.8515625, "grad_norm_var": 0.04967142740885417, "learning_rate": 0.0001, "loss": 1.6702, "loss/crossentropy": 2.6750309467315674, "loss/fcd": 1.5078125, "loss/idx": 3.0, "loss/logits": 0.16242003440856934, "step": 455 }, { "epoch": 0.00680906980043154, "grad_norm": 0.7890625, "grad_norm_var": 0.052223459879557295, "learning_rate": 0.0001, "loss": 1.7029, "loss/crossentropy": 2.411786437034607, "loss/fcd": 1.53515625, "loss/idx": 3.0, "loss/logits": 0.16775980591773987, "step": 456 }, { "epoch": 0.0068240019710465215, "grad_norm": 1.109375, "grad_norm_var": 0.05133260091145833, "learning_rate": 0.0001, "loss": 1.7541, "loss/crossentropy": 2.7279776334762573, "loss/fcd": 1.58203125, "loss/idx": 3.0, "loss/logits": 0.17208966612815857, "step": 457 }, { "epoch": 0.0068389341416615025, "grad_norm": 0.86328125, "grad_norm_var": 0.05223433176676432, "learning_rate": 0.0001, "loss": 1.8433, "loss/crossentropy": 2.8027533292770386, "loss/fcd": 1.64453125, "loss/idx": 3.0, "loss/logits": 0.19876495003700256, "step": 458 }, { "epoch": 0.006853866312276484, "grad_norm": 0.87890625, "grad_norm_var": 0.05280939737955729, "learning_rate": 0.0001, "loss": 1.7351, "loss/crossentropy": 2.3620423078536987, "loss/fcd": 1.55859375, "loss/idx": 3.0, "loss/logits": 0.176472008228302, "step": 459 }, { "epoch": 0.006868798482891465, "grad_norm": 0.85546875, "grad_norm_var": 0.05407079060872396, "learning_rate": 0.0001, "loss": 1.7763, "loss/crossentropy": 2.96125864982605, "loss/fcd": 1.58984375, "loss/idx": 3.0, "loss/logits": 0.1864861696958542, "step": 460 }, { "epoch": 0.006883730653506447, "grad_norm": 0.53125, "grad_norm_var": 0.043050130208333336, "learning_rate": 0.0001, "loss": 1.5814, "loss/crossentropy": 2.5584583282470703, "loss/fcd": 1.43359375, "loss/idx": 3.5, "loss/logits": 0.14783349633216858, "step": 461 }, { "epoch": 0.006898662824121428, "grad_norm": 0.75390625, "grad_norm_var": 0.042862892150878906, "learning_rate": 0.0001, "loss": 1.8894, "loss/crossentropy": 2.4608118534088135, "loss/fcd": 1.66015625, "loss/idx": 3.5, "loss/logits": 0.22920453548431396, "step": 462 }, { "epoch": 0.00691359499473641, "grad_norm": 0.62890625, "grad_norm_var": 0.04817072550455729, "learning_rate": 0.0001, "loss": 1.5976, "loss/crossentropy": 2.7459927797317505, "loss/fcd": 1.4296875, "loss/idx": 3.5, "loss/logits": 0.16794496029615402, "step": 463 }, { "epoch": 0.006928527165351392, "grad_norm": 0.5625, "grad_norm_var": 0.05430475870768229, "learning_rate": 0.0001, "loss": 1.6139, "loss/crossentropy": 2.567529797554016, "loss/fcd": 1.44140625, "loss/idx": 3.5, "loss/logits": 0.1725279539823532, "step": 464 }, { "epoch": 0.006943459335966373, "grad_norm": 0.63671875, "grad_norm_var": 0.057838694254557295, "learning_rate": 0.0001, "loss": 1.6145, "loss/crossentropy": 2.7352017164230347, "loss/fcd": 1.44921875, "loss/idx": 3.5, "loss/logits": 0.16529685258865356, "step": 465 }, { "epoch": 0.0069583915065813545, "grad_norm": 0.55859375, "grad_norm_var": 0.060045814514160155, "learning_rate": 0.0001, "loss": 1.6171, "loss/crossentropy": 2.4850025177001953, "loss/fcd": 1.4375, "loss/idx": 3.5, "loss/logits": 0.17964298278093338, "step": 466 }, { "epoch": 0.0069733236771963355, "grad_norm": 1.09375, "grad_norm_var": 0.034832191467285153, "learning_rate": 0.0001, "loss": 2.1071, "loss/crossentropy": 2.3254082202911377, "loss/fcd": 1.8828125, "loss/idx": 3.5, "loss/logits": 0.22428961843252182, "step": 467 }, { "epoch": 0.006988255847811317, "grad_norm": 0.57421875, "grad_norm_var": 0.03771565755208333, "learning_rate": 0.0001, "loss": 1.7908, "loss/crossentropy": 2.8251163959503174, "loss/fcd": 1.59375, "loss/idx": 3.5, "loss/logits": 0.19702401757240295, "step": 468 }, { "epoch": 0.007003188018426298, "grad_norm": 0.7421875, "grad_norm_var": 0.03205464680989583, "learning_rate": 0.0001, "loss": 1.6263, "loss/crossentropy": 2.5575175285339355, "loss/fcd": 1.47265625, "loss/idx": 3.5, "loss/logits": 0.15361517667770386, "step": 469 }, { "epoch": 0.00701812018904128, "grad_norm": 0.7890625, "grad_norm_var": 0.031758626302083336, "learning_rate": 0.0001, "loss": 2.1285, "loss/crossentropy": 2.5669792890548706, "loss/fcd": 1.8671875, "loss/idx": 3.5, "loss/logits": 0.26132843643426895, "step": 470 }, { "epoch": 0.007033052359656261, "grad_norm": 0.66796875, "grad_norm_var": 0.03171380360921224, "learning_rate": 0.0001, "loss": 1.6446, "loss/crossentropy": 2.677261710166931, "loss/fcd": 1.46875, "loss/idx": 3.5, "loss/logits": 0.17586223781108856, "step": 471 }, { "epoch": 0.007047984530271243, "grad_norm": 0.59375, "grad_norm_var": 0.03313795725504557, "learning_rate": 0.0001, "loss": 1.703, "loss/crossentropy": 2.6583694219589233, "loss/fcd": 1.515625, "loss/idx": 3.5, "loss/logits": 0.1873987913131714, "step": 472 }, { "epoch": 0.007062916700886225, "grad_norm": 0.59765625, "grad_norm_var": 0.0243011474609375, "learning_rate": 0.0001, "loss": 1.7603, "loss/crossentropy": 2.4877501726150513, "loss/fcd": 1.5703125, "loss/idx": 3.5, "loss/logits": 0.19003190845251083, "step": 473 }, { "epoch": 0.007077848871501206, "grad_norm": 0.66015625, "grad_norm_var": 0.022674560546875, "learning_rate": 0.0001, "loss": 1.589, "loss/crossentropy": 2.693013906478882, "loss/fcd": 1.421875, "loss/idx": 3.5, "loss/logits": 0.16710253804922104, "step": 474 }, { "epoch": 0.0070927810421161875, "grad_norm": 0.703125, "grad_norm_var": 0.020302772521972656, "learning_rate": 0.0001, "loss": 1.7601, "loss/crossentropy": 2.572099208831787, "loss/fcd": 1.5703125, "loss/idx": 3.5, "loss/logits": 0.18978480249643326, "step": 475 }, { "epoch": 0.0071077132127311685, "grad_norm": 0.50390625, "grad_norm_var": 0.020005226135253906, "learning_rate": 0.0001, "loss": 1.6699, "loss/crossentropy": 2.449280023574829, "loss/fcd": 1.49609375, "loss/idx": 3.5, "loss/logits": 0.17377550154924393, "step": 476 }, { "epoch": 0.00712264538334615, "grad_norm": 0.59375, "grad_norm_var": 0.019156837463378908, "learning_rate": 0.0001, "loss": 1.5723, "loss/crossentropy": 2.59575355052948, "loss/fcd": 1.41015625, "loss/idx": 3.5, "loss/logits": 0.1621238887310028, "step": 477 }, { "epoch": 0.007137577553961131, "grad_norm": 0.8203125, "grad_norm_var": 0.02020848592122396, "learning_rate": 0.0001, "loss": 1.6419, "loss/crossentropy": 2.577454090118408, "loss/fcd": 1.47265625, "loss/idx": 3.5, "loss/logits": 0.1692582219839096, "step": 478 }, { "epoch": 0.007152509724576113, "grad_norm": 0.52734375, "grad_norm_var": 0.021415201822916667, "learning_rate": 0.0001, "loss": 1.6061, "loss/crossentropy": 2.471789002418518, "loss/fcd": 1.45703125, "loss/idx": 3.5, "loss/logits": 0.14902313798666, "step": 479 }, { "epoch": 0.007167441895191094, "grad_norm": 0.53515625, "grad_norm_var": 0.02183221181233724, "learning_rate": 0.0001, "loss": 1.4982, "loss/crossentropy": 2.617826461791992, "loss/fcd": 1.35546875, "loss/idx": 3.5, "loss/logits": 0.14270956814289093, "step": 480 }, { "epoch": 0.007182374065806076, "grad_norm": 0.484375, "grad_norm_var": 0.02380345662434896, "learning_rate": 0.0001, "loss": 1.5917, "loss/crossentropy": 2.6413131952285767, "loss/fcd": 1.421875, "loss/idx": 3.5, "loss/logits": 0.16986460238695145, "step": 481 }, { "epoch": 0.007197306236421058, "grad_norm": 0.7578125, "grad_norm_var": 0.02378075917561849, "learning_rate": 0.0001, "loss": 1.662, "loss/crossentropy": 2.6517175436019897, "loss/fcd": 1.48046875, "loss/idx": 3.5, "loss/logits": 0.18155179917812347, "step": 482 }, { "epoch": 0.007212238407036039, "grad_norm": 0.55078125, "grad_norm_var": 0.011187489827473958, "learning_rate": 0.0001, "loss": 1.656, "loss/crossentropy": 2.555253267288208, "loss/fcd": 1.47265625, "loss/idx": 3.5, "loss/logits": 0.1832970231771469, "step": 483 }, { "epoch": 0.0072271705776510205, "grad_norm": 0.57421875, "grad_norm_var": 0.011187489827473958, "learning_rate": 0.0001, "loss": 1.4994, "loss/crossentropy": 2.5745534896850586, "loss/fcd": 1.359375, "loss/idx": 3.5, "loss/logits": 0.1399783045053482, "step": 484 }, { "epoch": 0.0072421027482660015, "grad_norm": 0.625, "grad_norm_var": 0.010313924153645833, "learning_rate": 0.0001, "loss": 1.7049, "loss/crossentropy": 2.673888325691223, "loss/fcd": 1.52734375, "loss/idx": 3.5, "loss/logits": 0.1775348037481308, "step": 485 }, { "epoch": 0.007257034918880983, "grad_norm": 0.5625, "grad_norm_var": 0.008536529541015626, "learning_rate": 0.0001, "loss": 1.4905, "loss/crossentropy": 2.710415840148926, "loss/fcd": 1.3515625, "loss/idx": 3.5, "loss/logits": 0.13889621198177338, "step": 486 }, { "epoch": 0.007271967089495964, "grad_norm": 0.5234375, "grad_norm_var": 0.008722368876139324, "learning_rate": 0.0001, "loss": 1.6983, "loss/crossentropy": 2.6029082536697388, "loss/fcd": 1.50390625, "loss/idx": 3.5, "loss/logits": 0.19442546367645264, "step": 487 }, { "epoch": 0.007286899260110946, "grad_norm": 1.015625, "grad_norm_var": 0.019447771708170573, "learning_rate": 0.0001, "loss": 1.8318, "loss/crossentropy": 2.562933087348938, "loss/fcd": 1.66015625, "loss/idx": 3.5, "loss/logits": 0.17163604497909546, "step": 488 }, { "epoch": 0.007301831430725927, "grad_norm": 0.71484375, "grad_norm_var": 0.019844500223795573, "learning_rate": 0.0001, "loss": 1.9322, "loss/crossentropy": 2.3318947553634644, "loss/fcd": 1.70703125, "loss/idx": 3.5, "loss/logits": 0.22517724335193634, "step": 489 }, { "epoch": 0.007316763601340909, "grad_norm": 0.66796875, "grad_norm_var": 0.019875017801920573, "learning_rate": 0.0001, "loss": 1.6361, "loss/crossentropy": 2.6366230249404907, "loss/fcd": 1.48046875, "loss/idx": 3.5, "loss/logits": 0.1556784212589264, "step": 490 }, { "epoch": 0.007331695771955891, "grad_norm": 0.6328125, "grad_norm_var": 0.01954542795817057, "learning_rate": 0.0001, "loss": 1.6544, "loss/crossentropy": 2.5596346855163574, "loss/fcd": 1.484375, "loss/idx": 3.5, "loss/logits": 0.17000765353441238, "step": 491 }, { "epoch": 0.007346627942570872, "grad_norm": 0.62890625, "grad_norm_var": 0.018410174051920573, "learning_rate": 0.0001, "loss": 1.666, "loss/crossentropy": 2.6965473890304565, "loss/fcd": 1.49609375, "loss/idx": 3.5, "loss/logits": 0.16991965472698212, "step": 492 }, { "epoch": 0.0073615601131858536, "grad_norm": 0.55859375, "grad_norm_var": 0.01869684855143229, "learning_rate": 0.0001, "loss": 1.6618, "loss/crossentropy": 2.6663613319396973, "loss/fcd": 1.48046875, "loss/idx": 3.5, "loss/logits": 0.18130898475646973, "step": 493 }, { "epoch": 0.0073764922838008345, "grad_norm": 0.57421875, "grad_norm_var": 0.016441790262858073, "learning_rate": 0.0001, "loss": 1.5194, "loss/crossentropy": 2.4370299577713013, "loss/fcd": 1.3671875, "loss/idx": 3.5, "loss/logits": 0.1522502452135086, "step": 494 }, { "epoch": 0.007391424454415816, "grad_norm": 0.54296875, "grad_norm_var": 0.016262245178222657, "learning_rate": 0.0001, "loss": 1.5874, "loss/crossentropy": 2.5481144189834595, "loss/fcd": 1.4296875, "loss/idx": 3.5, "loss/logits": 0.1577276885509491, "step": 495 }, { "epoch": 0.007406356625030797, "grad_norm": 0.51953125, "grad_norm_var": 0.01645806630452474, "learning_rate": 0.0001, "loss": 1.6746, "loss/crossentropy": 2.5940628051757812, "loss/fcd": 1.48046875, "loss/idx": 3.5, "loss/logits": 0.19416546076536179, "step": 496 }, { "epoch": 0.007421288795645779, "grad_norm": 0.515625, "grad_norm_var": 0.015950457255045573, "learning_rate": 0.0001, "loss": 1.5521, "loss/crossentropy": 2.6910064220428467, "loss/fcd": 1.390625, "loss/idx": 3.5, "loss/logits": 0.16150504350662231, "step": 497 }, { "epoch": 0.00743622096626076, "grad_norm": 0.51171875, "grad_norm_var": 0.015305582682291667, "learning_rate": 0.0001, "loss": 1.6799, "loss/crossentropy": 2.6522045135498047, "loss/fcd": 1.4921875, "loss/idx": 3.5, "loss/logits": 0.18767046928405762, "step": 498 }, { "epoch": 0.007451153136875742, "grad_norm": 0.53515625, "grad_norm_var": 0.0154388427734375, "learning_rate": 0.0001, "loss": 1.6198, "loss/crossentropy": 2.6188907623291016, "loss/fcd": 1.453125, "loss/idx": 3.5, "loss/logits": 0.1666594073176384, "step": 499 }, { "epoch": 0.007466085307490724, "grad_norm": 0.58203125, "grad_norm_var": 0.015409088134765625, "learning_rate": 0.0001, "loss": 1.7251, "loss/crossentropy": 2.6162188053131104, "loss/fcd": 1.5390625, "loss/idx": 3.5, "loss/logits": 0.1860610470175743, "step": 500 }, { "epoch": 0.007481017478105705, "grad_norm": 1.7890625, "grad_norm_var": 0.10290323893229167, "learning_rate": 0.0001, "loss": 1.859, "loss/crossentropy": 2.9810004234313965, "loss/fcd": 1.671875, "loss/idx": 4.0, "loss/logits": 0.1871199607849121, "step": 501 }, { "epoch": 0.0074959496487206866, "grad_norm": 3.328125, "grad_norm_var": 0.5377329508463542, "learning_rate": 0.0001, "loss": 1.8599, "loss/crossentropy": 2.6053121089935303, "loss/fcd": 1.67578125, "loss/idx": 4.0, "loss/logits": 0.18407516926527023, "step": 502 }, { "epoch": 0.0075108818193356675, "grad_norm": 2.84375, "grad_norm_var": 0.7724077860514323, "learning_rate": 0.0001, "loss": 1.8754, "loss/crossentropy": 2.6646742820739746, "loss/fcd": 1.69140625, "loss/idx": 4.0, "loss/logits": 0.18401946127414703, "step": 503 }, { "epoch": 0.007525813989950649, "grad_norm": 2.6875, "grad_norm_var": 0.9511329650878906, "learning_rate": 0.0001, "loss": 1.9323, "loss/crossentropy": 2.666251301765442, "loss/fcd": 1.734375, "loss/idx": 4.0, "loss/logits": 0.1978878602385521, "step": 504 }, { "epoch": 0.00754074616056563, "grad_norm": 2.671875, "grad_norm_var": 1.0894693374633788, "learning_rate": 0.0001, "loss": 1.961, "loss/crossentropy": 2.8120644092559814, "loss/fcd": 1.76171875, "loss/idx": 4.0, "loss/logits": 0.19924385100603104, "step": 505 }, { "epoch": 0.007555678331180612, "grad_norm": 2.421875, "grad_norm_var": 1.1516153971354166, "learning_rate": 0.0001, "loss": 1.9617, "loss/crossentropy": 2.4772077798843384, "loss/fcd": 1.7578125, "loss/idx": 4.0, "loss/logits": 0.20385053008794785, "step": 506 }, { "epoch": 0.007570610501795593, "grad_norm": 2.234375, "grad_norm_var": 1.1621986389160157, "learning_rate": 0.0001, "loss": 2.0677, "loss/crossentropy": 2.6911627054214478, "loss/fcd": 1.83984375, "loss/idx": 4.0, "loss/logits": 0.2278064265847206, "step": 507 }, { "epoch": 0.007585542672410575, "grad_norm": 2.0625, "grad_norm_var": 1.136742083231608, "learning_rate": 0.0001, "loss": 1.9551, "loss/crossentropy": 2.6768856048583984, "loss/fcd": 1.7578125, "loss/idx": 4.0, "loss/logits": 0.19726867973804474, "step": 508 }, { "epoch": 0.007600474843025557, "grad_norm": 1.6953125, "grad_norm_var": 1.0712290445963542, "learning_rate": 0.0001, "loss": 1.9038, "loss/crossentropy": 2.9186242818832397, "loss/fcd": 1.7109375, "loss/idx": 4.0, "loss/logits": 0.19286338984966278, "step": 509 }, { "epoch": 0.007615407013640538, "grad_norm": 1.5078125, "grad_norm_var": 0.9986719131469727, "learning_rate": 0.0001, "loss": 1.9477, "loss/crossentropy": 2.547776460647583, "loss/fcd": 1.74609375, "loss/idx": 4.0, "loss/logits": 0.20161531120538712, "step": 510 }, { "epoch": 0.00763033918425552, "grad_norm": 1.3125, "grad_norm_var": 0.9217814127604167, "learning_rate": 0.0001, "loss": 1.8293, "loss/crossentropy": 2.7642080783843994, "loss/fcd": 1.64453125, "loss/idx": 4.0, "loss/logits": 0.1847827509045601, "step": 511 }, { "epoch": 0.0076452713548705005, "grad_norm": 1.5, "grad_norm_var": 0.8273889541625976, "learning_rate": 0.0001, "loss": 1.922, "loss/crossentropy": 2.5652897357940674, "loss/fcd": 1.72265625, "loss/idx": 4.0, "loss/logits": 0.19934390485286713, "step": 512 }, { "epoch": 0.007660203525485482, "grad_norm": 1.28125, "grad_norm_var": 0.7367451349894206, "learning_rate": 0.0001, "loss": 2.081, "loss/crossentropy": 2.3452470302581787, "loss/fcd": 1.84375, "loss/idx": 4.0, "loss/logits": 0.23722243309020996, "step": 513 }, { "epoch": 0.007675135696100463, "grad_norm": 1.7890625, "grad_norm_var": 0.6175554911295573, "learning_rate": 0.0001, "loss": 2.1585, "loss/crossentropy": 2.6333130598068237, "loss/fcd": 1.9453125, "loss/idx": 4.0, "loss/logits": 0.21314261853694916, "step": 514 }, { "epoch": 0.007690067866715445, "grad_norm": 1.203125, "grad_norm_var": 0.5247639338175456, "learning_rate": 0.0001, "loss": 2.0899, "loss/crossentropy": 2.723830461502075, "loss/fcd": 1.84765625, "loss/idx": 4.0, "loss/logits": 0.24222549051046371, "step": 515 }, { "epoch": 0.007705000037330426, "grad_norm": 1.859375, "grad_norm_var": 0.3968424479166667, "learning_rate": 0.0001, "loss": 2.0611, "loss/crossentropy": 2.3409069776535034, "loss/fcd": 1.828125, "loss/idx": 4.0, "loss/logits": 0.23299187421798706, "step": 516 }, { "epoch": 0.007719932207945408, "grad_norm": 1.0625, "grad_norm_var": 0.4514055887858073, "learning_rate": 0.0001, "loss": 1.7604, "loss/crossentropy": 2.5428874492645264, "loss/fcd": 1.59765625, "loss/idx": 4.0, "loss/logits": 0.16276978701353073, "step": 517 }, { "epoch": 0.00773486437856039, "grad_norm": 0.94921875, "grad_norm_var": 0.37315413157145183, "learning_rate": 0.0001, "loss": 2.0376, "loss/crossentropy": 2.533219337463379, "loss/fcd": 1.80078125, "loss/idx": 4.0, "loss/logits": 0.2368319183588028, "step": 518 }, { "epoch": 0.007749796549175371, "grad_norm": 0.96875, "grad_norm_var": 0.33634993235270183, "learning_rate": 0.0001, "loss": 1.8927, "loss/crossentropy": 2.352767586708069, "loss/fcd": 1.70703125, "loss/idx": 4.0, "loss/logits": 0.18567398935556412, "step": 519 }, { "epoch": 0.007764728719790353, "grad_norm": 1.1171875, "grad_norm_var": 0.28380171457926434, "learning_rate": 0.0001, "loss": 2.0012, "loss/crossentropy": 2.597265362739563, "loss/fcd": 1.76953125, "loss/idx": 4.0, "loss/logits": 0.23170067369937897, "step": 520 }, { "epoch": 0.0077796608904053335, "grad_norm": 1.125, "grad_norm_var": 0.21275221506754557, "learning_rate": 0.0001, "loss": 1.7805, "loss/crossentropy": 2.6241856813430786, "loss/fcd": 1.60546875, "loss/idx": 4.0, "loss/logits": 0.17502596974372864, "step": 521 }, { "epoch": 0.007794593061020315, "grad_norm": 1.203125, "grad_norm_var": 0.15669447580973309, "learning_rate": 0.0001, "loss": 1.915, "loss/crossentropy": 2.6768182516098022, "loss/fcd": 1.7109375, "loss/idx": 4.0, "loss/logits": 0.2040882408618927, "step": 522 }, { "epoch": 0.007809525231635296, "grad_norm": 1.3671875, "grad_norm_var": 0.11062513987223307, "learning_rate": 0.0001, "loss": 1.7797, "loss/crossentropy": 2.5710521936416626, "loss/fcd": 1.61328125, "loss/idx": 4.0, "loss/logits": 0.16644778847694397, "step": 523 }, { "epoch": 0.007824457402250278, "grad_norm": 1.0859375, "grad_norm_var": 0.08074334462483725, "learning_rate": 0.0001, "loss": 2.0208, "loss/crossentropy": 2.3945902585983276, "loss/fcd": 1.78515625, "loss/idx": 4.0, "loss/logits": 0.23565331101417542, "step": 524 }, { "epoch": 0.007839389572865259, "grad_norm": 0.921875, "grad_norm_var": 0.07882989247639974, "learning_rate": 0.0001, "loss": 1.6759, "loss/crossentropy": 2.6742191314697266, "loss/fcd": 1.51953125, "loss/idx": 4.0, "loss/logits": 0.15633848309516907, "step": 525 }, { "epoch": 0.007854321743480242, "grad_norm": 1.0078125, "grad_norm_var": 0.07832533518473307, "learning_rate": 0.0001, "loss": 1.7404, "loss/crossentropy": 2.4373925924301147, "loss/fcd": 1.55859375, "loss/idx": 4.0, "loss/logits": 0.18178628385066986, "step": 526 }, { "epoch": 0.007869253914095223, "grad_norm": 1.140625, "grad_norm_var": 0.07838687896728516, "learning_rate": 0.0001, "loss": 1.8653, "loss/crossentropy": 2.4041073322296143, "loss/fcd": 1.66796875, "loss/idx": 4.0, "loss/logits": 0.19732805341482162, "step": 527 }, { "epoch": 0.007884186084710204, "grad_norm": 1.0390625, "grad_norm_var": 0.0746957778930664, "learning_rate": 0.0001, "loss": 1.8203, "loss/crossentropy": 2.6815097332000732, "loss/fcd": 1.63671875, "loss/idx": 4.0, "loss/logits": 0.18355443328619003, "step": 528 }, { "epoch": 0.007899118255325185, "grad_norm": 2.0, "grad_norm_var": 0.11524244944254557, "learning_rate": 0.0001, "loss": 2.093, "loss/crossentropy": 2.7026476860046387, "loss/fcd": 1.69140625, "loss/idx": 4.0, "loss/logits": 0.40154484659433365, "step": 529 }, { "epoch": 0.007914050425940167, "grad_norm": 0.88671875, "grad_norm_var": 0.10007120768229166, "learning_rate": 0.0001, "loss": 1.78, "loss/crossentropy": 2.686976909637451, "loss/fcd": 1.59375, "loss/idx": 4.0, "loss/logits": 0.18629659712314606, "step": 530 }, { "epoch": 0.007928982596555148, "grad_norm": 1.1796875, "grad_norm_var": 0.10004450480143229, "learning_rate": 0.0001, "loss": 1.8566, "loss/crossentropy": 2.3627219200134277, "loss/fcd": 1.671875, "loss/idx": 4.0, "loss/logits": 0.1846960484981537, "step": 531 }, { "epoch": 0.00794391476717013, "grad_norm": 1.0234375, "grad_norm_var": 0.0682342529296875, "learning_rate": 0.0001, "loss": 1.9221, "loss/crossentropy": 2.5744149684906006, "loss/fcd": 1.703125, "loss/idx": 4.0, "loss/logits": 0.21895581483840942, "step": 532 }, { "epoch": 0.00795884693778511, "grad_norm": 0.84375, "grad_norm_var": 0.0731903076171875, "learning_rate": 0.0001, "loss": 1.6618, "loss/crossentropy": 2.7105607986450195, "loss/fcd": 1.5078125, "loss/idx": 4.0, "loss/logits": 0.15397943556308746, "step": 533 }, { "epoch": 0.007973779108400093, "grad_norm": 1.21875, "grad_norm_var": 0.07172946929931641, "learning_rate": 0.0001, "loss": 1.8617, "loss/crossentropy": 2.6265203952789307, "loss/fcd": 1.6875, "loss/idx": 4.0, "loss/logits": 0.17423325031995773, "step": 534 }, { "epoch": 0.007988711279015074, "grad_norm": 0.8203125, "grad_norm_var": 0.07635847727457683, "learning_rate": 0.0001, "loss": 1.7946, "loss/crossentropy": 2.5890642404556274, "loss/fcd": 1.6015625, "loss/idx": 4.0, "loss/logits": 0.1930510774254799, "step": 535 }, { "epoch": 0.008003643449630055, "grad_norm": 1.2578125, "grad_norm_var": 0.07747084299723307, "learning_rate": 0.0001, "loss": 1.91, "loss/crossentropy": 2.8161873817443848, "loss/fcd": 1.69921875, "loss/idx": 4.0, "loss/logits": 0.21079972386360168, "step": 536 }, { "epoch": 0.008018575620245038, "grad_norm": 0.85546875, "grad_norm_var": 0.08228327433268229, "learning_rate": 0.0001, "loss": 1.7658, "loss/crossentropy": 2.6365318298339844, "loss/fcd": 1.57421875, "loss/idx": 4.0, "loss/logits": 0.19160092622041702, "step": 537 }, { "epoch": 0.008033507790860019, "grad_norm": 0.8515625, "grad_norm_var": 0.08591105143229166, "learning_rate": 0.0001, "loss": 1.7883, "loss/crossentropy": 2.6011857986450195, "loss/fcd": 1.59765625, "loss/idx": 4.0, "loss/logits": 0.19066426903009415, "step": 538 }, { "epoch": 0.008048439961475, "grad_norm": 0.87109375, "grad_norm_var": 0.08320611317952474, "learning_rate": 0.0001, "loss": 1.8642, "loss/crossentropy": 2.594154477119446, "loss/fcd": 1.6640625, "loss/idx": 4.0, "loss/logits": 0.20009979605674744, "step": 539 }, { "epoch": 0.00806337213208998, "grad_norm": 0.96875, "grad_norm_var": 0.08370202382405599, "learning_rate": 0.0001, "loss": 2.02, "loss/crossentropy": 2.350104331970215, "loss/fcd": 1.80078125, "loss/idx": 4.0, "loss/logits": 0.21917122602462769, "step": 540 }, { "epoch": 0.008078304302704963, "grad_norm": 0.703125, "grad_norm_var": 0.09058780670166015, "learning_rate": 0.0001, "loss": 1.6337, "loss/crossentropy": 2.5559120178222656, "loss/fcd": 1.48046875, "loss/idx": 4.0, "loss/logits": 0.1532716527581215, "step": 541 }, { "epoch": 0.008093236473319944, "grad_norm": 0.8359375, "grad_norm_var": 0.09321180979410808, "learning_rate": 0.0001, "loss": 1.8395, "loss/crossentropy": 2.4848121404647827, "loss/fcd": 1.6484375, "loss/idx": 4.0, "loss/logits": 0.1910661906003952, "step": 542 }, { "epoch": 0.008108168643934925, "grad_norm": 0.83984375, "grad_norm_var": 0.09446996053059896, "learning_rate": 0.0001, "loss": 1.7239, "loss/crossentropy": 2.6207433938980103, "loss/fcd": 1.546875, "loss/idx": 4.0, "loss/logits": 0.17703650146722794, "step": 543 }, { "epoch": 0.008123100814549908, "grad_norm": 0.91015625, "grad_norm_var": 0.09504693349202474, "learning_rate": 0.0001, "loss": 1.6846, "loss/crossentropy": 2.741000533103943, "loss/fcd": 1.51953125, "loss/idx": 4.0, "loss/logits": 0.1650281846523285, "step": 544 }, { "epoch": 0.008138032985164889, "grad_norm": 0.8359375, "grad_norm_var": 0.025172869364420574, "learning_rate": 0.0001, "loss": 1.8462, "loss/crossentropy": 2.739645481109619, "loss/fcd": 1.6328125, "loss/idx": 4.0, "loss/logits": 0.21342986822128296, "step": 545 }, { "epoch": 0.00815296515577987, "grad_norm": 1.484375, "grad_norm_var": 0.04393717447916667, "learning_rate": 0.0001, "loss": 2.2019, "loss/crossentropy": 2.5596991777420044, "loss/fcd": 1.93359375, "loss/idx": 4.0, "loss/logits": 0.2683301120996475, "step": 546 }, { "epoch": 0.00816789732639485, "grad_norm": 0.89453125, "grad_norm_var": 0.040999285380045575, "learning_rate": 0.0001, "loss": 2.1296, "loss/crossentropy": 2.631078362464905, "loss/fcd": 1.87890625, "loss/idx": 4.0, "loss/logits": 0.25071533769369125, "step": 547 }, { "epoch": 0.008182829497009833, "grad_norm": 0.98046875, "grad_norm_var": 0.04069925944010417, "learning_rate": 0.0001, "loss": 1.766, "loss/crossentropy": 2.6593146324157715, "loss/fcd": 1.5859375, "loss/idx": 4.0, "loss/logits": 0.18003150820732117, "step": 548 }, { "epoch": 0.008197761667624814, "grad_norm": 0.90625, "grad_norm_var": 0.0400726318359375, "learning_rate": 0.0001, "loss": 1.6572, "loss/crossentropy": 2.527645230293274, "loss/fcd": 1.5, "loss/idx": 4.0, "loss/logits": 0.1571880429983139, "step": 549 }, { "epoch": 0.008212693838239795, "grad_norm": 0.8359375, "grad_norm_var": 0.035623931884765626, "learning_rate": 0.0001, "loss": 1.7319, "loss/crossentropy": 2.492905378341675, "loss/fcd": 1.55078125, "loss/idx": 4.0, "loss/logits": 0.18110989034175873, "step": 550 }, { "epoch": 0.008227626008854776, "grad_norm": 0.95703125, "grad_norm_var": 0.034825070699055986, "learning_rate": 0.0001, "loss": 1.6543, "loss/crossentropy": 2.4883817434310913, "loss/fcd": 1.51171875, "loss/idx": 4.0, "loss/logits": 0.14261526614427567, "step": 551 }, { "epoch": 0.008242558179469759, "grad_norm": 1.171875, "grad_norm_var": 0.03160800933837891, "learning_rate": 0.0001, "loss": 2.1634, "loss/crossentropy": 2.388165831565857, "loss/fcd": 1.89453125, "loss/idx": 4.0, "loss/logits": 0.2688213065266609, "step": 552 }, { "epoch": 0.00825749035008474, "grad_norm": 0.9453125, "grad_norm_var": 0.031202952067057293, "learning_rate": 0.0001, "loss": 1.5087, "loss/crossentropy": 2.4662253856658936, "loss/fcd": 1.37109375, "loss/idx": 4.0, "loss/logits": 0.13757767528295517, "step": 553 }, { "epoch": 0.008272422520699721, "grad_norm": 0.8671875, "grad_norm_var": 0.031040191650390625, "learning_rate": 0.0001, "loss": 1.7147, "loss/crossentropy": 2.6518259048461914, "loss/fcd": 1.546875, "loss/idx": 4.0, "loss/logits": 0.16782306134700775, "step": 554 }, { "epoch": 0.008287354691314704, "grad_norm": 0.953125, "grad_norm_var": 0.030729103088378906, "learning_rate": 0.0001, "loss": 1.7046, "loss/crossentropy": 2.50111985206604, "loss/fcd": 1.53515625, "loss/idx": 4.0, "loss/logits": 0.1694774478673935, "step": 555 }, { "epoch": 0.008302286861929685, "grad_norm": 0.87890625, "grad_norm_var": 0.030926513671875, "learning_rate": 0.0001, "loss": 1.8382, "loss/crossentropy": 2.4261611700057983, "loss/fcd": 1.6328125, "loss/idx": 4.0, "loss/logits": 0.20543111115694046, "step": 556 }, { "epoch": 0.008317219032544666, "grad_norm": 1.71875, "grad_norm_var": 0.0636566162109375, "learning_rate": 0.0001, "loss": 2.1674, "loss/crossentropy": 2.5285680294036865, "loss/fcd": 1.9140625, "loss/idx": 4.0, "loss/logits": 0.25333091616630554, "step": 557 }, { "epoch": 0.008332151203159647, "grad_norm": 0.8046875, "grad_norm_var": 0.06440531412760417, "learning_rate": 0.0001, "loss": 1.8293, "loss/crossentropy": 2.547691226005554, "loss/fcd": 1.62890625, "loss/idx": 4.0, "loss/logits": 0.2003782019019127, "step": 558 }, { "epoch": 0.00834708337377463, "grad_norm": 1.015625, "grad_norm_var": 0.06260573069254557, "learning_rate": 0.0001, "loss": 1.7806, "loss/crossentropy": 2.4365475177764893, "loss/fcd": 1.60546875, "loss/idx": 4.0, "loss/logits": 0.1750938817858696, "step": 559 }, { "epoch": 0.00836201554438961, "grad_norm": 1.0234375, "grad_norm_var": 0.061899566650390626, "learning_rate": 0.0001, "loss": 2.228, "loss/crossentropy": 2.2742353677749634, "loss/fcd": 1.96484375, "loss/idx": 4.0, "loss/logits": 0.263118177652359, "step": 560 }, { "epoch": 0.008376947715004591, "grad_norm": 0.8671875, "grad_norm_var": 0.06120580037434896, "learning_rate": 0.0001, "loss": 1.9159, "loss/crossentropy": 2.607572078704834, "loss/fcd": 1.69921875, "loss/idx": 4.0, "loss/logits": 0.2166425883769989, "step": 561 }, { "epoch": 0.008391879885619574, "grad_norm": 0.96484375, "grad_norm_var": 0.045841407775878903, "learning_rate": 0.0001, "loss": 1.9524, "loss/crossentropy": 2.3942710161209106, "loss/fcd": 1.74609375, "loss/idx": 4.0, "loss/logits": 0.20629461109638214, "step": 562 }, { "epoch": 0.008406812056234555, "grad_norm": 0.796875, "grad_norm_var": 0.04763590494791667, "learning_rate": 0.0001, "loss": 1.7762, "loss/crossentropy": 2.674455165863037, "loss/fcd": 1.5859375, "loss/idx": 4.0, "loss/logits": 0.1902579665184021, "step": 563 }, { "epoch": 0.008421744226849536, "grad_norm": 1.0078125, "grad_norm_var": 0.047682634989420575, "learning_rate": 0.0001, "loss": 1.742, "loss/crossentropy": 2.687700867652893, "loss/fcd": 1.55859375, "loss/idx": 4.0, "loss/logits": 0.18337178230285645, "step": 564 }, { "epoch": 0.008436676397464517, "grad_norm": 0.8671875, "grad_norm_var": 0.04817345937093099, "learning_rate": 0.0001, "loss": 1.7458, "loss/crossentropy": 2.496551990509033, "loss/fcd": 1.5703125, "loss/idx": 4.0, "loss/logits": 0.1754860207438469, "step": 565 }, { "epoch": 0.0084516085680795, "grad_norm": 1.046875, "grad_norm_var": 0.046910031636555986, "learning_rate": 0.0001, "loss": 1.8244, "loss/crossentropy": 2.7592931985855103, "loss/fcd": 1.62890625, "loss/idx": 4.0, "loss/logits": 0.19546330720186234, "step": 566 }, { "epoch": 0.00846654073869448, "grad_norm": 0.9765625, "grad_norm_var": 0.04684041341145833, "learning_rate": 0.0001, "loss": 1.6384, "loss/crossentropy": 2.685667395591736, "loss/fcd": 1.484375, "loss/idx": 4.0, "loss/logits": 0.1539781242609024, "step": 567 }, { "epoch": 0.008481472909309461, "grad_norm": 0.98046875, "grad_norm_var": 0.04459425608317057, "learning_rate": 0.0001, "loss": 1.8716, "loss/crossentropy": 2.4638712406158447, "loss/fcd": 1.65625, "loss/idx": 4.0, "loss/logits": 0.21538397669792175, "step": 568 }, { "epoch": 0.008496405079924444, "grad_norm": 0.89453125, "grad_norm_var": 0.04500503540039062, "learning_rate": 0.0001, "loss": 1.7427, "loss/crossentropy": 2.3749929666519165, "loss/fcd": 1.5625, "loss/idx": 4.0, "loss/logits": 0.18019939959049225, "step": 569 }, { "epoch": 0.008511337250539425, "grad_norm": 0.87890625, "grad_norm_var": 0.044838905334472656, "learning_rate": 0.0001, "loss": 1.7957, "loss/crossentropy": 2.600667715072632, "loss/fcd": 1.61328125, "loss/idx": 4.0, "loss/logits": 0.18243755400180817, "step": 570 }, { "epoch": 0.008526269421154406, "grad_norm": 1.484375, "grad_norm_var": 0.06059309641520182, "learning_rate": 0.0001, "loss": 2.0295, "loss/crossentropy": 2.462936758995056, "loss/fcd": 1.8203125, "loss/idx": 4.0, "loss/logits": 0.20919711887836456, "step": 571 }, { "epoch": 0.008541201591769387, "grad_norm": 1.0390625, "grad_norm_var": 0.059334055582682295, "learning_rate": 0.0001, "loss": 1.5455, "loss/crossentropy": 2.69628369808197, "loss/fcd": 1.3984375, "loss/idx": 4.0, "loss/logits": 0.1470213457942009, "step": 572 }, { "epoch": 0.00855613376238437, "grad_norm": 0.88671875, "grad_norm_var": 0.025410906473795573, "learning_rate": 0.0001, "loss": 1.8762, "loss/crossentropy": 2.385751962661743, "loss/fcd": 1.68359375, "loss/idx": 4.0, "loss/logits": 0.19262713938951492, "step": 573 }, { "epoch": 0.00857106593299935, "grad_norm": 0.8984375, "grad_norm_var": 0.023881975809733072, "learning_rate": 0.0001, "loss": 1.7466, "loss/crossentropy": 2.782795786857605, "loss/fcd": 1.578125, "loss/idx": 4.0, "loss/logits": 0.16844282299280167, "step": 574 }, { "epoch": 0.008585998103614332, "grad_norm": 1.046875, "grad_norm_var": 0.024104754130045574, "learning_rate": 0.0001, "loss": 1.8152, "loss/crossentropy": 2.5910396575927734, "loss/fcd": 1.6171875, "loss/idx": 4.0, "loss/logits": 0.19796662032604218, "step": 575 }, { "epoch": 0.008600930274229313, "grad_norm": 0.74609375, "grad_norm_var": 0.02726008097330729, "learning_rate": 0.0001, "loss": 1.6011, "loss/crossentropy": 2.625463128089905, "loss/fcd": 1.4453125, "loss/idx": 4.0, "loss/logits": 0.15583696961402893, "step": 576 }, { "epoch": 0.008615862444844295, "grad_norm": 0.86328125, "grad_norm_var": 0.02731011708577474, "learning_rate": 0.0001, "loss": 1.6691, "loss/crossentropy": 2.7770665884017944, "loss/fcd": 1.515625, "loss/idx": 4.0, "loss/logits": 0.15351397544145584, "step": 577 }, { "epoch": 0.008630794615459276, "grad_norm": 1.359375, "grad_norm_var": 0.037231190999348955, "learning_rate": 0.0001, "loss": 1.8882, "loss/crossentropy": 2.6627167463302612, "loss/fcd": 1.66015625, "loss/idx": 4.0, "loss/logits": 0.22807708382606506, "step": 578 }, { "epoch": 0.008645726786074257, "grad_norm": 0.91015625, "grad_norm_var": 0.03517907460530599, "learning_rate": 0.0001, "loss": 1.7459, "loss/crossentropy": 2.574559807777405, "loss/fcd": 1.5546875, "loss/idx": 4.0, "loss/logits": 0.19124917685985565, "step": 579 }, { "epoch": 0.00866065895668924, "grad_norm": 0.97265625, "grad_norm_var": 0.035186513264973955, "learning_rate": 0.0001, "loss": 1.7128, "loss/crossentropy": 2.4655394554138184, "loss/fcd": 1.52734375, "loss/idx": 4.0, "loss/logits": 0.18545571714639664, "step": 580 }, { "epoch": 0.00867559112730422, "grad_norm": 0.9140625, "grad_norm_var": 0.034551747639973956, "learning_rate": 0.0001, "loss": 1.6711, "loss/crossentropy": 2.4528276920318604, "loss/fcd": 1.5, "loss/idx": 4.0, "loss/logits": 0.17109280824661255, "step": 581 }, { "epoch": 0.008690523297919202, "grad_norm": 1.234375, "grad_norm_var": 0.03807957967122396, "learning_rate": 0.0001, "loss": 1.926, "loss/crossentropy": 2.6185566186904907, "loss/fcd": 1.69921875, "loss/idx": 4.0, "loss/logits": 0.22679200768470764, "step": 582 }, { "epoch": 0.008705455468534183, "grad_norm": 0.7734375, "grad_norm_var": 0.04143854777018229, "learning_rate": 0.0001, "loss": 1.7634, "loss/crossentropy": 2.3130193948745728, "loss/fcd": 1.5859375, "loss/idx": 4.0, "loss/logits": 0.17749568819999695, "step": 583 }, { "epoch": 0.008720387639149165, "grad_norm": 0.875, "grad_norm_var": 0.04230543772379557, "learning_rate": 0.0001, "loss": 1.7338, "loss/crossentropy": 2.6866395473480225, "loss/fcd": 1.54296875, "loss/idx": 4.0, "loss/logits": 0.1908554881811142, "step": 584 }, { "epoch": 0.008735319809764146, "grad_norm": 0.95703125, "grad_norm_var": 0.04178663889567057, "learning_rate": 0.0001, "loss": 1.6385, "loss/crossentropy": 2.8928191661834717, "loss/fcd": 1.47265625, "loss/idx": 4.0, "loss/logits": 0.16579563915729523, "step": 585 }, { "epoch": 0.008750251980379127, "grad_norm": 0.859375, "grad_norm_var": 0.04209976196289063, "learning_rate": 0.0001, "loss": 1.6173, "loss/crossentropy": 2.532406449317932, "loss/fcd": 1.45703125, "loss/idx": 4.0, "loss/logits": 0.16028352081775665, "step": 586 }, { "epoch": 0.00876518415099411, "grad_norm": 0.83203125, "grad_norm_var": 0.025589434305826823, "learning_rate": 0.0001, "loss": 1.7514, "loss/crossentropy": 2.5307902097702026, "loss/fcd": 1.56640625, "loss/idx": 4.0, "loss/logits": 0.18504076451063156, "step": 587 }, { "epoch": 0.008780116321609091, "grad_norm": 1.03125, "grad_norm_var": 0.025498390197753906, "learning_rate": 0.0001, "loss": 1.8988, "loss/crossentropy": 2.645454168319702, "loss/fcd": 1.69140625, "loss/idx": 4.0, "loss/logits": 0.20737073570489883, "step": 588 }, { "epoch": 0.008795048492224072, "grad_norm": 0.9765625, "grad_norm_var": 0.025274658203125, "learning_rate": 0.0001, "loss": 1.6831, "loss/crossentropy": 2.5572890043258667, "loss/fcd": 1.515625, "loss/idx": 4.0, "loss/logits": 0.16751381009817123, "step": 589 }, { "epoch": 0.008809980662839053, "grad_norm": 0.85546875, "grad_norm_var": 0.02570336659749349, "learning_rate": 0.0001, "loss": 1.687, "loss/crossentropy": 2.4566808938980103, "loss/fcd": 1.50390625, "loss/idx": 4.0, "loss/logits": 0.1830824688076973, "step": 590 }, { "epoch": 0.008824912833454036, "grad_norm": 0.83203125, "grad_norm_var": 0.02582575480143229, "learning_rate": 0.0001, "loss": 1.7327, "loss/crossentropy": 2.6418092250823975, "loss/fcd": 1.5546875, "loss/idx": 4.0, "loss/logits": 0.17799954861402512, "step": 591 }, { "epoch": 0.008839845004069017, "grad_norm": 1.046875, "grad_norm_var": 0.02382348378499349, "learning_rate": 0.0001, "loss": 1.7224, "loss/crossentropy": 2.5941314697265625, "loss/fcd": 1.54296875, "loss/idx": 4.0, "loss/logits": 0.17938321828842163, "step": 592 }, { "epoch": 0.008854777174683998, "grad_norm": 0.80078125, "grad_norm_var": 0.024838701883951823, "learning_rate": 0.0001, "loss": 1.7012, "loss/crossentropy": 2.594764471054077, "loss/fcd": 1.515625, "loss/idx": 4.0, "loss/logits": 0.18560975790023804, "step": 593 }, { "epoch": 0.008869709345298979, "grad_norm": 0.84765625, "grad_norm_var": 0.0134033203125, "learning_rate": 0.0001, "loss": 1.7647, "loss/crossentropy": 2.653563141822815, "loss/fcd": 1.546875, "loss/idx": 4.0, "loss/logits": 0.21787381172180176, "step": 594 }, { "epoch": 0.008884641515913961, "grad_norm": 0.8671875, "grad_norm_var": 0.013574663798014324, "learning_rate": 0.0001, "loss": 1.9553, "loss/crossentropy": 2.704614758491516, "loss/fcd": 1.71484375, "loss/idx": 4.0, "loss/logits": 0.24047966301441193, "step": 595 }, { "epoch": 0.008899573686528942, "grad_norm": 0.65625, "grad_norm_var": 0.017493693033854167, "learning_rate": 0.0001, "loss": 1.6271, "loss/crossentropy": 2.810309410095215, "loss/fcd": 1.46875, "loss/idx": 4.0, "loss/logits": 0.1583174169063568, "step": 596 }, { "epoch": 0.008914505857143923, "grad_norm": 0.796875, "grad_norm_var": 0.018092600504557292, "learning_rate": 0.0001, "loss": 1.6866, "loss/crossentropy": 2.514691710472107, "loss/fcd": 1.51953125, "loss/idx": 4.0, "loss/logits": 0.16707603633403778, "step": 597 }, { "epoch": 0.008929438027758906, "grad_norm": 0.9453125, "grad_norm_var": 0.010047403971354167, "learning_rate": 0.0001, "loss": 1.8422, "loss/crossentropy": 2.545697331428528, "loss/fcd": 1.6328125, "loss/idx": 4.0, "loss/logits": 0.2093753144145012, "step": 598 }, { "epoch": 0.008944370198373887, "grad_norm": 0.8359375, "grad_norm_var": 0.0094696044921875, "learning_rate": 0.0001, "loss": 1.6943, "loss/crossentropy": 2.565446376800537, "loss/fcd": 1.53125, "loss/idx": 4.0, "loss/logits": 0.16300475597381592, "step": 599 }, { "epoch": 0.008959302368988868, "grad_norm": 0.8359375, "grad_norm_var": 0.009570058186848958, "learning_rate": 0.0001, "loss": 1.596, "loss/crossentropy": 2.9047205448150635, "loss/fcd": 1.4375, "loss/idx": 4.0, "loss/logits": 0.15853270143270493, "step": 600 }, { "epoch": 0.008974234539603849, "grad_norm": 0.71875, "grad_norm_var": 0.01046593983968099, "learning_rate": 0.0001, "loss": 1.6888, "loss/crossentropy": 2.3885122537612915, "loss/fcd": 1.50390625, "loss/idx": 4.0, "loss/logits": 0.18485693633556366, "step": 601 }, { "epoch": 0.008989166710218831, "grad_norm": 0.78125, "grad_norm_var": 0.01083978017171224, "learning_rate": 0.0001, "loss": 1.7191, "loss/crossentropy": 2.561643123626709, "loss/fcd": 1.546875, "loss/idx": 4.0, "loss/logits": 0.17220281809568405, "step": 602 }, { "epoch": 0.009004098880833812, "grad_norm": 1.0, "grad_norm_var": 0.012116495768229167, "learning_rate": 0.0001, "loss": 2.1397, "loss/crossentropy": 2.338186025619507, "loss/fcd": 1.859375, "loss/idx": 4.0, "loss/logits": 0.2802945226430893, "step": 603 }, { "epoch": 0.009019031051448793, "grad_norm": 1.0625, "grad_norm_var": 0.012873331705729166, "learning_rate": 0.0001, "loss": 1.9177, "loss/crossentropy": 2.7651796340942383, "loss/fcd": 1.68359375, "loss/idx": 4.0, "loss/logits": 0.23406735062599182, "step": 604 }, { "epoch": 0.009033963222063776, "grad_norm": 1.1640625, "grad_norm_var": 0.017829386393229167, "learning_rate": 0.0001, "loss": 1.7051, "loss/crossentropy": 2.51702082157135, "loss/fcd": 1.52734375, "loss/idx": 4.0, "loss/logits": 0.17778894305229187, "step": 605 }, { "epoch": 0.009048895392678757, "grad_norm": 0.9921875, "grad_norm_var": 0.01858819325764974, "learning_rate": 0.0001, "loss": 1.8238, "loss/crossentropy": 2.4436594247817993, "loss/fcd": 1.62109375, "loss/idx": 4.0, "loss/logits": 0.20268025994300842, "step": 606 }, { "epoch": 0.009063827563293738, "grad_norm": 1.09375, "grad_norm_var": 0.020969390869140625, "learning_rate": 0.0001, "loss": 1.815, "loss/crossentropy": 2.816960573196411, "loss/fcd": 1.609375, "loss/idx": 4.0, "loss/logits": 0.20562273263931274, "step": 607 }, { "epoch": 0.009078759733908719, "grad_norm": 1.0390625, "grad_norm_var": 0.020823160807291668, "learning_rate": 0.0001, "loss": 1.8813, "loss/crossentropy": 2.362921714782715, "loss/fcd": 1.68359375, "loss/idx": 4.0, "loss/logits": 0.19768549501895905, "step": 608 }, { "epoch": 0.009093691904523702, "grad_norm": 0.828125, "grad_norm_var": 0.020499610900878908, "learning_rate": 0.0001, "loss": 1.6839, "loss/crossentropy": 2.6550045013427734, "loss/fcd": 1.50390625, "loss/idx": 4.0, "loss/logits": 0.17995422333478928, "step": 609 }, { "epoch": 0.009108624075138683, "grad_norm": 1.0703125, "grad_norm_var": 0.021923828125, "learning_rate": 0.0001, "loss": 1.8508, "loss/crossentropy": 2.478885054588318, "loss/fcd": 1.6484375, "loss/idx": 4.0, "loss/logits": 0.20236501097679138, "step": 610 }, { "epoch": 0.009123556245753664, "grad_norm": 0.83203125, "grad_norm_var": 0.022239112854003908, "learning_rate": 0.0001, "loss": 1.8277, "loss/crossentropy": 2.754876732826233, "loss/fcd": 1.61328125, "loss/idx": 4.0, "loss/logits": 0.21440355479717255, "step": 611 }, { "epoch": 0.009138488416368645, "grad_norm": 0.83203125, "grad_norm_var": 0.0180877685546875, "learning_rate": 0.0001, "loss": 1.7951, "loss/crossentropy": 2.5221598148345947, "loss/fcd": 1.59765625, "loss/idx": 4.0, "loss/logits": 0.19747836142778397, "step": 612 }, { "epoch": 0.009153420586983627, "grad_norm": 0.828125, "grad_norm_var": 0.017607625325520834, "learning_rate": 0.0001, "loss": 1.7805, "loss/crossentropy": 2.666842222213745, "loss/fcd": 1.59765625, "loss/idx": 4.0, "loss/logits": 0.1828431710600853, "step": 613 }, { "epoch": 0.009168352757598608, "grad_norm": 1.203125, "grad_norm_var": 0.022332509358723957, "learning_rate": 0.0001, "loss": 1.8873, "loss/crossentropy": 2.542251467704773, "loss/fcd": 1.671875, "loss/idx": 4.0, "loss/logits": 0.21547389030456543, "step": 614 }, { "epoch": 0.00918328492821359, "grad_norm": 1.0625, "grad_norm_var": 0.022251383463541666, "learning_rate": 0.0001, "loss": 1.871, "loss/crossentropy": 2.180862307548523, "loss/fcd": 1.6640625, "loss/idx": 4.0, "loss/logits": 0.2069542557001114, "step": 615 }, { "epoch": 0.009198217098828572, "grad_norm": 1.0, "grad_norm_var": 0.02124201456705729, "learning_rate": 0.0001, "loss": 1.8445, "loss/crossentropy": 2.765215754508972, "loss/fcd": 1.625, "loss/idx": 4.0, "loss/logits": 0.21946142613887787, "step": 616 }, { "epoch": 0.009213149269443553, "grad_norm": 1.125, "grad_norm_var": 0.017988840738932293, "learning_rate": 0.0001, "loss": 1.5044, "loss/crossentropy": 2.6672626733779907, "loss/fcd": 1.3671875, "loss/idx": 4.0, "loss/logits": 0.1371738463640213, "step": 617 }, { "epoch": 0.009228081440058534, "grad_norm": 0.84375, "grad_norm_var": 0.01645482381184896, "learning_rate": 0.0001, "loss": 1.65, "loss/crossentropy": 2.4921282529830933, "loss/fcd": 1.49609375, "loss/idx": 4.0, "loss/logits": 0.15389146655797958, "step": 618 }, { "epoch": 0.009243013610673515, "grad_norm": 0.92578125, "grad_norm_var": 0.01678460439046224, "learning_rate": 0.0001, "loss": 1.9129, "loss/crossentropy": 2.4398714303970337, "loss/fcd": 1.703125, "loss/idx": 4.0, "loss/logits": 0.20973220467567444, "step": 619 }, { "epoch": 0.009257945781288498, "grad_norm": 0.87109375, "grad_norm_var": 0.01732355753580729, "learning_rate": 0.0001, "loss": 1.7631, "loss/crossentropy": 2.7551677227020264, "loss/fcd": 1.578125, "loss/idx": 4.0, "loss/logits": 0.18502139300107956, "step": 620 }, { "epoch": 0.009272877951903478, "grad_norm": 0.95703125, "grad_norm_var": 0.01497491200764974, "learning_rate": 0.0001, "loss": 1.7992, "loss/crossentropy": 2.537253499031067, "loss/fcd": 1.609375, "loss/idx": 4.0, "loss/logits": 0.18983100354671478, "step": 621 }, { "epoch": 0.00928781012251846, "grad_norm": 1.4765625, "grad_norm_var": 0.031136512756347656, "learning_rate": 0.0001, "loss": 1.6154, "loss/crossentropy": 2.4758503437042236, "loss/fcd": 1.4609375, "loss/idx": 4.0, "loss/logits": 0.15446265041828156, "step": 622 }, { "epoch": 0.009302742293133442, "grad_norm": 0.890625, "grad_norm_var": 0.031156349182128906, "learning_rate": 0.0001, "loss": 1.6562, "loss/crossentropy": 2.649666428565979, "loss/fcd": 1.4921875, "loss/idx": 4.0, "loss/logits": 0.16403885930776596, "step": 623 }, { "epoch": 0.009317674463748423, "grad_norm": 1.046875, "grad_norm_var": 0.03121484120686849, "learning_rate": 0.0001, "loss": 1.6746, "loss/crossentropy": 2.6181329488754272, "loss/fcd": 1.5078125, "loss/idx": 4.0, "loss/logits": 0.1668335720896721, "step": 624 }, { "epoch": 0.009332606634363404, "grad_norm": 1.3671875, "grad_norm_var": 0.037953122456868486, "learning_rate": 0.0001, "loss": 2.1473, "loss/crossentropy": 2.5993008613586426, "loss/fcd": 1.921875, "loss/idx": 4.0, "loss/logits": 0.22541476786136627, "step": 625 }, { "epoch": 0.009347538804978385, "grad_norm": 1.2109375, "grad_norm_var": 0.04011834462483724, "learning_rate": 0.0001, "loss": 1.7797, "loss/crossentropy": 2.519839644432068, "loss/fcd": 1.58984375, "loss/idx": 4.0, "loss/logits": 0.18983253091573715, "step": 626 }, { "epoch": 0.009362470975593368, "grad_norm": 0.9140625, "grad_norm_var": 0.03837865193684896, "learning_rate": 0.0001, "loss": 1.7139, "loss/crossentropy": 2.5223900079727173, "loss/fcd": 1.53515625, "loss/idx": 4.0, "loss/logits": 0.17874927818775177, "step": 627 }, { "epoch": 0.009377403146208349, "grad_norm": 1.6640625, "grad_norm_var": 0.05916589101155599, "learning_rate": 0.0001, "loss": 1.872, "loss/crossentropy": 2.6015422344207764, "loss/fcd": 1.69140625, "loss/idx": 4.0, "loss/logits": 0.18056581169366837, "step": 628 }, { "epoch": 0.00939233531682333, "grad_norm": 0.9921875, "grad_norm_var": 0.05519250233968099, "learning_rate": 0.0001, "loss": 1.7462, "loss/crossentropy": 2.525193214416504, "loss/fcd": 1.56640625, "loss/idx": 4.0, "loss/logits": 0.1797695904970169, "step": 629 }, { "epoch": 0.00940726748743831, "grad_norm": 0.86328125, "grad_norm_var": 0.05759862263997396, "learning_rate": 0.0001, "loss": 1.7776, "loss/crossentropy": 2.4848132133483887, "loss/fcd": 1.58203125, "loss/idx": 4.0, "loss/logits": 0.19558971375226974, "step": 630 }, { "epoch": 0.009422199658053293, "grad_norm": 1.15625, "grad_norm_var": 0.057983144124348955, "learning_rate": 0.0001, "loss": 2.0806, "loss/crossentropy": 2.5745344161987305, "loss/fcd": 1.83203125, "loss/idx": 4.0, "loss/logits": 0.24858521670103073, "step": 631 }, { "epoch": 0.009437131828668274, "grad_norm": 0.8359375, "grad_norm_var": 0.06144917805989583, "learning_rate": 0.0001, "loss": 1.7292, "loss/crossentropy": 2.63934063911438, "loss/fcd": 1.55078125, "loss/idx": 4.0, "loss/logits": 0.1784433200955391, "step": 632 }, { "epoch": 0.009452063999283255, "grad_norm": 0.8671875, "grad_norm_var": 0.06375706990559896, "learning_rate": 0.0001, "loss": 1.5677, "loss/crossentropy": 2.8568373918533325, "loss/fcd": 1.421875, "loss/idx": 4.0, "loss/logits": 0.14580092206597328, "step": 633 }, { "epoch": 0.009466996169898238, "grad_norm": 0.8203125, "grad_norm_var": 0.06445210774739583, "learning_rate": 0.0001, "loss": 1.6701, "loss/crossentropy": 2.530740976333618, "loss/fcd": 1.49609375, "loss/idx": 4.0, "loss/logits": 0.17399245500564575, "step": 634 }, { "epoch": 0.009481928340513219, "grad_norm": 0.9375, "grad_norm_var": 0.06426080067952473, "learning_rate": 0.0001, "loss": 1.6844, "loss/crossentropy": 2.536529541015625, "loss/fcd": 1.51171875, "loss/idx": 4.0, "loss/logits": 0.17269806563854218, "step": 635 }, { "epoch": 0.0094968605111282, "grad_norm": 0.890625, "grad_norm_var": 0.06380716959635417, "learning_rate": 0.0001, "loss": 1.792, "loss/crossentropy": 2.396178364753723, "loss/fcd": 1.6015625, "loss/idx": 4.0, "loss/logits": 0.19045118242502213, "step": 636 }, { "epoch": 0.00951179268174318, "grad_norm": 0.953125, "grad_norm_var": 0.063859494527181, "learning_rate": 0.0001, "loss": 1.9673, "loss/crossentropy": 2.3137094974517822, "loss/fcd": 1.734375, "loss/idx": 4.0, "loss/logits": 0.23295484483242035, "step": 637 }, { "epoch": 0.009526724852358164, "grad_norm": 0.8515625, "grad_norm_var": 0.05317834218343099, "learning_rate": 0.0001, "loss": 1.7548, "loss/crossentropy": 2.7743014097213745, "loss/fcd": 1.5703125, "loss/idx": 4.0, "loss/logits": 0.18447993695735931, "step": 638 }, { "epoch": 0.009541657022973144, "grad_norm": 0.87890625, "grad_norm_var": 0.05338338216145833, "learning_rate": 0.0001, "loss": 1.7626, "loss/crossentropy": 2.4832268953323364, "loss/fcd": 1.5625, "loss/idx": 4.0, "loss/logits": 0.20006201416254044, "step": 639 }, { "epoch": 0.009556589193588125, "grad_norm": 0.9375, "grad_norm_var": 0.053675333658854164, "learning_rate": 0.0001, "loss": 1.7401, "loss/crossentropy": 2.3719040155410767, "loss/fcd": 1.55859375, "loss/idx": 4.0, "loss/logits": 0.18149051070213318, "step": 640 }, { "epoch": 0.009571521364203108, "grad_norm": 0.88671875, "grad_norm_var": 0.04514357248942057, "learning_rate": 0.0001, "loss": 1.8719, "loss/crossentropy": 2.3773958683013916, "loss/fcd": 1.67578125, "loss/idx": 4.0, "loss/logits": 0.19614291191101074, "step": 641 }, { "epoch": 0.009586453534818089, "grad_norm": 0.93359375, "grad_norm_var": 0.04136530558268229, "learning_rate": 0.0001, "loss": 1.7711, "loss/crossentropy": 2.8984084129333496, "loss/fcd": 1.5703125, "loss/idx": 4.0, "loss/logits": 0.20078962296247482, "step": 642 }, { "epoch": 0.00960138570543307, "grad_norm": 0.75390625, "grad_norm_var": 0.043979835510253903, "learning_rate": 0.0001, "loss": 1.7898, "loss/crossentropy": 2.430532932281494, "loss/fcd": 1.58984375, "loss/idx": 4.0, "loss/logits": 0.19993210583925247, "step": 643 }, { "epoch": 0.009616317876048051, "grad_norm": 1.0625, "grad_norm_var": 0.009436988830566406, "learning_rate": 0.0001, "loss": 1.6162, "loss/crossentropy": 2.6839109659194946, "loss/fcd": 1.45703125, "loss/idx": 4.0, "loss/logits": 0.15920980274677277, "step": 644 }, { "epoch": 0.009631250046663034, "grad_norm": 0.765625, "grad_norm_var": 0.010277748107910156, "learning_rate": 0.0001, "loss": 1.5942, "loss/crossentropy": 2.341609477996826, "loss/fcd": 1.44140625, "loss/idx": 4.0, "loss/logits": 0.15275691449642181, "step": 645 }, { "epoch": 0.009646182217278015, "grad_norm": 1.015625, "grad_norm_var": 0.0109893798828125, "learning_rate": 0.0001, "loss": 1.6706, "loss/crossentropy": 2.766583561897278, "loss/fcd": 1.49609375, "loss/idx": 4.0, "loss/logits": 0.174458347260952, "step": 646 }, { "epoch": 0.009661114387892996, "grad_norm": 0.9921875, "grad_norm_var": 0.007266998291015625, "learning_rate": 0.0001, "loss": 1.8037, "loss/crossentropy": 2.591106653213501, "loss/fcd": 1.609375, "loss/idx": 4.0, "loss/logits": 0.19436196237802505, "step": 647 }, { "epoch": 0.009676046558507978, "grad_norm": 0.78515625, "grad_norm_var": 0.007854652404785157, "learning_rate": 0.0001, "loss": 1.8771, "loss/crossentropy": 2.5671372413635254, "loss/fcd": 1.66015625, "loss/idx": 4.0, "loss/logits": 0.2169155329465866, "step": 648 }, { "epoch": 0.00969097872912296, "grad_norm": 0.91015625, "grad_norm_var": 0.007806396484375, "learning_rate": 0.0001, "loss": 1.9598, "loss/crossentropy": 2.369256615638733, "loss/fcd": 1.703125, "loss/idx": 4.0, "loss/logits": 0.2566370368003845, "step": 649 }, { "epoch": 0.00970591089973794, "grad_norm": 1.0234375, "grad_norm_var": 0.008269246419270833, "learning_rate": 0.0001, "loss": 1.7494, "loss/crossentropy": 2.582263708114624, "loss/fcd": 1.5625, "loss/idx": 4.0, "loss/logits": 0.1869092434644699, "step": 650 }, { "epoch": 0.009720843070352921, "grad_norm": 0.8125, "grad_norm_var": 0.008806355794270833, "learning_rate": 0.0001, "loss": 1.5369, "loss/crossentropy": 2.713529109954834, "loss/fcd": 1.39453125, "loss/idx": 4.0, "loss/logits": 0.1423557996749878, "step": 651 }, { "epoch": 0.009735775240967904, "grad_norm": 0.84375, "grad_norm_var": 0.009023030598958334, "learning_rate": 0.0001, "loss": 1.7421, "loss/crossentropy": 2.555392622947693, "loss/fcd": 1.55859375, "loss/idx": 4.0, "loss/logits": 0.18351471424102783, "step": 652 }, { "epoch": 0.009750707411582885, "grad_norm": 0.921875, "grad_norm_var": 0.008864339192708333, "learning_rate": 0.0001, "loss": 1.6934, "loss/crossentropy": 2.4481018781661987, "loss/fcd": 1.51171875, "loss/idx": 4.0, "loss/logits": 0.1817130595445633, "step": 653 }, { "epoch": 0.009765639582197866, "grad_norm": 0.8671875, "grad_norm_var": 0.008781941731770833, "learning_rate": 0.0001, "loss": 1.7043, "loss/crossentropy": 2.630600929260254, "loss/fcd": 1.53125, "loss/idx": 4.0, "loss/logits": 0.17303074896335602, "step": 654 }, { "epoch": 0.009780571752812847, "grad_norm": 0.9609375, "grad_norm_var": 0.00897820790608724, "learning_rate": 0.0001, "loss": 1.564, "loss/crossentropy": 2.6150777339935303, "loss/fcd": 1.41015625, "loss/idx": 4.0, "loss/logits": 0.15385961532592773, "step": 655 }, { "epoch": 0.00979550392342783, "grad_norm": 0.97265625, "grad_norm_var": 0.009209950764973959, "learning_rate": 0.0001, "loss": 1.6253, "loss/crossentropy": 2.5774786472320557, "loss/fcd": 1.46875, "loss/idx": 4.0, "loss/logits": 0.15658701956272125, "step": 656 }, { "epoch": 0.00981043609404281, "grad_norm": 0.76953125, "grad_norm_var": 0.010381062825520834, "learning_rate": 0.0001, "loss": 1.7633, "loss/crossentropy": 2.5385220050811768, "loss/fcd": 1.578125, "loss/idx": 4.0, "loss/logits": 0.18515019118785858, "step": 657 }, { "epoch": 0.009825368264657791, "grad_norm": 0.71484375, "grad_norm_var": 0.0123748779296875, "learning_rate": 0.0001, "loss": 1.532, "loss/crossentropy": 2.4262478351593018, "loss/fcd": 1.3828125, "loss/idx": 4.0, "loss/logits": 0.14915584027767181, "step": 658 }, { "epoch": 0.009840300435272774, "grad_norm": 0.81640625, "grad_norm_var": 0.0115203857421875, "learning_rate": 0.0001, "loss": 1.7338, "loss/crossentropy": 2.755561113357544, "loss/fcd": 1.5390625, "loss/idx": 4.0, "loss/logits": 0.19469743967056274, "step": 659 }, { "epoch": 0.009855232605887755, "grad_norm": 1.0859375, "grad_norm_var": 0.012094879150390625, "learning_rate": 0.0001, "loss": 2.1387, "loss/crossentropy": 2.3337032794952393, "loss/fcd": 1.85546875, "loss/idx": 4.0, "loss/logits": 0.28325171023607254, "step": 660 }, { "epoch": 0.009870164776502736, "grad_norm": 0.75390625, "grad_norm_var": 0.012299537658691406, "learning_rate": 0.0001, "loss": 1.6438, "loss/crossentropy": 2.4794669151306152, "loss/fcd": 1.48046875, "loss/idx": 4.0, "loss/logits": 0.1632954254746437, "step": 661 }, { "epoch": 0.009885096947117717, "grad_norm": 0.97265625, "grad_norm_var": 0.0116973876953125, "learning_rate": 0.0001, "loss": 1.7155, "loss/crossentropy": 2.642146348953247, "loss/fcd": 1.53515625, "loss/idx": 4.0, "loss/logits": 0.18031089007854462, "step": 662 }, { "epoch": 0.0099000291177327, "grad_norm": 0.8359375, "grad_norm_var": 0.011046346028645833, "learning_rate": 0.0001, "loss": 1.8485, "loss/crossentropy": 2.4629205465316772, "loss/fcd": 1.65234375, "loss/idx": 4.0, "loss/logits": 0.19618399441242218, "step": 663 }, { "epoch": 0.00991496128834768, "grad_norm": 0.9296875, "grad_norm_var": 0.010564104715983073, "learning_rate": 0.0001, "loss": 1.6107, "loss/crossentropy": 2.5848491191864014, "loss/fcd": 1.453125, "loss/idx": 4.0, "loss/logits": 0.15756294131278992, "step": 664 }, { "epoch": 0.009929893458962662, "grad_norm": 0.90234375, "grad_norm_var": 0.010543759663899739, "learning_rate": 0.0001, "loss": 1.6383, "loss/crossentropy": 2.578953504562378, "loss/fcd": 1.48046875, "loss/idx": 4.0, "loss/logits": 0.15785705298185349, "step": 665 }, { "epoch": 0.009944825629577644, "grad_norm": 0.86328125, "grad_norm_var": 0.009222157796223958, "learning_rate": 0.0001, "loss": 1.7285, "loss/crossentropy": 2.6012935638427734, "loss/fcd": 1.55859375, "loss/idx": 4.0, "loss/logits": 0.16986890882253647, "step": 666 }, { "epoch": 0.009959757800192625, "grad_norm": 0.82421875, "grad_norm_var": 0.00913079579671224, "learning_rate": 0.0001, "loss": 1.675, "loss/crossentropy": 2.577793598175049, "loss/fcd": 1.4921875, "loss/idx": 4.0, "loss/logits": 0.18284159153699875, "step": 667 }, { "epoch": 0.009974689970807606, "grad_norm": 0.8671875, "grad_norm_var": 0.00906060536702474, "learning_rate": 0.0001, "loss": 1.6981, "loss/crossentropy": 2.6089415550231934, "loss/fcd": 1.52734375, "loss/idx": 4.0, "loss/logits": 0.17078907787799835, "step": 668 }, { "epoch": 0.009989622141422587, "grad_norm": 0.953125, "grad_norm_var": 0.009301694234212239, "learning_rate": 0.0001, "loss": 1.7175, "loss/crossentropy": 2.7468433380126953, "loss/fcd": 1.5390625, "loss/idx": 4.0, "loss/logits": 0.17840731143951416, "step": 669 }, { "epoch": 0.01000455431203757, "grad_norm": 0.84765625, "grad_norm_var": 0.009360504150390626, "learning_rate": 0.0001, "loss": 1.8898, "loss/crossentropy": 2.278976559638977, "loss/fcd": 1.6875, "loss/idx": 4.0, "loss/logits": 0.2022818624973297, "step": 670 }, { "epoch": 0.010019486482652551, "grad_norm": 0.890625, "grad_norm_var": 0.008905029296875, "learning_rate": 0.0001, "loss": 1.8042, "loss/crossentropy": 2.7631266117095947, "loss/fcd": 1.609375, "loss/idx": 4.0, "loss/logits": 0.194790318608284, "step": 671 }, { "epoch": 0.010034418653267532, "grad_norm": 0.78125, "grad_norm_var": 0.008702532450358073, "learning_rate": 0.0001, "loss": 1.6957, "loss/crossentropy": 2.5823254585266113, "loss/fcd": 1.515625, "loss/idx": 4.0, "loss/logits": 0.1801098957657814, "step": 672 }, { "epoch": 0.010049350823882513, "grad_norm": 0.765625, "grad_norm_var": 0.008752187093098959, "learning_rate": 0.0001, "loss": 1.7467, "loss/crossentropy": 2.3960039615631104, "loss/fcd": 1.5546875, "loss/idx": 4.0, "loss/logits": 0.19199489802122116, "step": 673 }, { "epoch": 0.010064282994497496, "grad_norm": 0.91015625, "grad_norm_var": 0.007283528645833333, "learning_rate": 0.0001, "loss": 1.8449, "loss/crossentropy": 2.629265785217285, "loss/fcd": 1.62890625, "loss/idx": 4.0, "loss/logits": 0.21602004766464233, "step": 674 }, { "epoch": 0.010079215165112477, "grad_norm": 0.63671875, "grad_norm_var": 0.010705312093098959, "learning_rate": 0.0001, "loss": 1.6236, "loss/crossentropy": 2.2417469024658203, "loss/fcd": 1.46484375, "loss/idx": 4.0, "loss/logits": 0.15875373780727386, "step": 675 }, { "epoch": 0.010094147335727457, "grad_norm": 0.86328125, "grad_norm_var": 0.007208188374837239, "learning_rate": 0.0001, "loss": 1.6652, "loss/crossentropy": 2.4856256246566772, "loss/fcd": 1.484375, "loss/idx": 4.0, "loss/logits": 0.1808222532272339, "step": 676 }, { "epoch": 0.01010907950634244, "grad_norm": 0.7890625, "grad_norm_var": 0.0068356831868489586, "learning_rate": 0.0001, "loss": 1.6519, "loss/crossentropy": 2.6791006326675415, "loss/fcd": 1.47265625, "loss/idx": 4.0, "loss/logits": 0.1792703941464424, "step": 677 }, { "epoch": 0.010124011676957421, "grad_norm": 1.0078125, "grad_norm_var": 0.00747826894124349, "learning_rate": 0.0001, "loss": 1.8774, "loss/crossentropy": 2.4157909154891968, "loss/fcd": 1.66796875, "loss/idx": 4.0, "loss/logits": 0.20945309102535248, "step": 678 }, { "epoch": 0.010138943847572402, "grad_norm": 0.98828125, "grad_norm_var": 0.008556874593098958, "learning_rate": 0.0001, "loss": 1.7904, "loss/crossentropy": 2.5772135257720947, "loss/fcd": 1.6015625, "loss/idx": 4.0, "loss/logits": 0.188814677298069, "step": 679 }, { "epoch": 0.010153876018187383, "grad_norm": 1.125, "grad_norm_var": 0.012657674153645833, "learning_rate": 0.0001, "loss": 1.6447, "loss/crossentropy": 2.690971612930298, "loss/fcd": 1.4765625, "loss/idx": 4.0, "loss/logits": 0.16817960143089294, "step": 680 }, { "epoch": 0.010168808188802366, "grad_norm": 0.87890625, "grad_norm_var": 0.012609608968098958, "learning_rate": 0.0001, "loss": 1.7396, "loss/crossentropy": 2.602474093437195, "loss/fcd": 1.55078125, "loss/idx": 4.0, "loss/logits": 0.1888354793190956, "step": 681 }, { "epoch": 0.010183740359417347, "grad_norm": 0.78515625, "grad_norm_var": 0.013108062744140624, "learning_rate": 0.0001, "loss": 1.5378, "loss/crossentropy": 2.501987338066101, "loss/fcd": 1.3984375, "loss/idx": 4.0, "loss/logits": 0.13934527337551117, "step": 682 }, { "epoch": 0.010198672530032328, "grad_norm": 0.90234375, "grad_norm_var": 0.013016510009765624, "learning_rate": 0.0001, "loss": 1.6797, "loss/crossentropy": 2.6341532468795776, "loss/fcd": 1.51171875, "loss/idx": 4.0, "loss/logits": 0.1679832711815834, "step": 683 }, { "epoch": 0.01021360470064731, "grad_norm": 0.7421875, "grad_norm_var": 0.014115142822265624, "learning_rate": 0.0001, "loss": 1.5106, "loss/crossentropy": 2.5898005962371826, "loss/fcd": 1.3671875, "loss/idx": 4.0, "loss/logits": 0.14340700209140778, "step": 684 }, { "epoch": 0.010228536871262291, "grad_norm": 0.953125, "grad_norm_var": 0.014115142822265624, "learning_rate": 0.0001, "loss": 1.7084, "loss/crossentropy": 2.754904866218567, "loss/fcd": 1.5234375, "loss/idx": 4.0, "loss/logits": 0.18500886112451553, "step": 685 }, { "epoch": 0.010243469041877272, "grad_norm": 0.87109375, "grad_norm_var": 0.0140899658203125, "learning_rate": 0.0001, "loss": 1.8091, "loss/crossentropy": 2.4006221294403076, "loss/fcd": 1.61328125, "loss/idx": 4.0, "loss/logits": 0.19577862322330475, "step": 686 }, { "epoch": 0.010258401212492253, "grad_norm": 1.3125, "grad_norm_var": 0.02647705078125, "learning_rate": 0.0001, "loss": 1.8188, "loss/crossentropy": 2.374703049659729, "loss/fcd": 1.6171875, "loss/idx": 4.0, "loss/logits": 0.20162508636713028, "step": 687 }, { "epoch": 0.010273333383107236, "grad_norm": 0.87890625, "grad_norm_var": 0.02559808095296224, "learning_rate": 0.0001, "loss": 1.671, "loss/crossentropy": 2.316514492034912, "loss/fcd": 1.49609375, "loss/idx": 4.0, "loss/logits": 0.17492541670799255, "step": 688 }, { "epoch": 0.010288265553722217, "grad_norm": 0.69140625, "grad_norm_var": 0.027278391520182292, "learning_rate": 0.0001, "loss": 1.6032, "loss/crossentropy": 2.468592405319214, "loss/fcd": 1.453125, "loss/idx": 4.0, "loss/logits": 0.15006089210510254, "step": 689 }, { "epoch": 0.010303197724337198, "grad_norm": 0.81640625, "grad_norm_var": 0.02765070597330729, "learning_rate": 0.0001, "loss": 1.6631, "loss/crossentropy": 2.5445820093154907, "loss/fcd": 1.49609375, "loss/idx": 4.0, "loss/logits": 0.16699903458356857, "step": 690 }, { "epoch": 0.010318129894952179, "grad_norm": 0.56640625, "grad_norm_var": 0.030335489908854166, "learning_rate": 0.0001, "loss": 1.6255, "loss/crossentropy": 2.490410089492798, "loss/fcd": 1.46484375, "loss/idx": 4.5, "loss/logits": 0.1606893539428711, "step": 691 }, { "epoch": 0.010333062065567162, "grad_norm": 0.7265625, "grad_norm_var": 0.03191318511962891, "learning_rate": 0.0001, "loss": 1.855, "loss/crossentropy": 2.340074300765991, "loss/fcd": 1.640625, "loss/idx": 4.5, "loss/logits": 0.21434715390205383, "step": 692 }, { "epoch": 0.010347994236182143, "grad_norm": 0.62890625, "grad_norm_var": 0.03539835611979167, "learning_rate": 0.0001, "loss": 1.8011, "loss/crossentropy": 2.5316191911697388, "loss/fcd": 1.59375, "loss/idx": 4.5, "loss/logits": 0.20732642710208893, "step": 693 }, { "epoch": 0.010362926406797123, "grad_norm": 0.54296875, "grad_norm_var": 0.04018751780192057, "learning_rate": 0.0001, "loss": 1.5777, "loss/crossentropy": 2.5405019521713257, "loss/fcd": 1.40234375, "loss/idx": 4.5, "loss/logits": 0.17533162981271744, "step": 694 }, { "epoch": 0.010377858577412106, "grad_norm": 0.5859375, "grad_norm_var": 0.04225031534830729, "learning_rate": 0.0001, "loss": 1.5453, "loss/crossentropy": 2.779569625854492, "loss/fcd": 1.37109375, "loss/idx": 4.5, "loss/logits": 0.17417190968990326, "step": 695 }, { "epoch": 0.010392790748027087, "grad_norm": 0.65625, "grad_norm_var": 0.03648249308268229, "learning_rate": 0.0001, "loss": 1.5685, "loss/crossentropy": 2.5351744890213013, "loss/fcd": 1.41015625, "loss/idx": 4.5, "loss/logits": 0.15832393616437912, "step": 696 }, { "epoch": 0.010407722918642068, "grad_norm": 0.73046875, "grad_norm_var": 0.035975138346354164, "learning_rate": 0.0001, "loss": 1.6724, "loss/crossentropy": 2.2647390365600586, "loss/fcd": 1.484375, "loss/idx": 4.5, "loss/logits": 0.1879829317331314, "step": 697 }, { "epoch": 0.010422655089257049, "grad_norm": 0.73828125, "grad_norm_var": 0.03604532877604167, "learning_rate": 0.0001, "loss": 1.8554, "loss/crossentropy": 2.6300876140594482, "loss/fcd": 1.59765625, "loss/idx": 4.5, "loss/logits": 0.25777651369571686, "step": 698 }, { "epoch": 0.010437587259872032, "grad_norm": 0.60546875, "grad_norm_var": 0.0363739013671875, "learning_rate": 0.0001, "loss": 1.5986, "loss/crossentropy": 2.5962857007980347, "loss/fcd": 1.42578125, "loss/idx": 4.5, "loss/logits": 0.17285378277301788, "step": 699 }, { "epoch": 0.010452519430487013, "grad_norm": 0.5859375, "grad_norm_var": 0.03812357584635417, "learning_rate": 0.0001, "loss": 1.6397, "loss/crossentropy": 2.7463815212249756, "loss/fcd": 1.44921875, "loss/idx": 4.5, "loss/logits": 0.19048676639795303, "step": 700 }, { "epoch": 0.010467451601101994, "grad_norm": 0.67578125, "grad_norm_var": 0.03516686757405599, "learning_rate": 0.0001, "loss": 1.6874, "loss/crossentropy": 2.5035320520401, "loss/fcd": 1.5, "loss/idx": 4.5, "loss/logits": 0.18743757903575897, "step": 701 }, { "epoch": 0.010482383771716976, "grad_norm": 0.7109375, "grad_norm_var": 0.03366800944010417, "learning_rate": 0.0001, "loss": 1.5613, "loss/crossentropy": 2.606451153755188, "loss/fcd": 1.39453125, "loss/idx": 4.5, "loss/logits": 0.16672632098197937, "step": 702 }, { "epoch": 0.010497315942331957, "grad_norm": 0.5234375, "grad_norm_var": 0.009806060791015625, "learning_rate": 0.0001, "loss": 1.6531, "loss/crossentropy": 2.852249503135681, "loss/fcd": 1.453125, "loss/idx": 4.5, "loss/logits": 0.20001086592674255, "step": 703 }, { "epoch": 0.010512248112946938, "grad_norm": 0.6796875, "grad_norm_var": 0.006644630432128906, "learning_rate": 0.0001, "loss": 1.5716, "loss/crossentropy": 2.573358654975891, "loss/fcd": 1.40625, "loss/idx": 4.5, "loss/logits": 0.16530360281467438, "step": 704 }, { "epoch": 0.01052718028356192, "grad_norm": 0.609375, "grad_norm_var": 0.006656646728515625, "learning_rate": 0.0001, "loss": 1.8231, "loss/crossentropy": 2.376923680305481, "loss/fcd": 1.609375, "loss/idx": 4.5, "loss/logits": 0.2137339860200882, "step": 705 }, { "epoch": 0.010542112454176902, "grad_norm": 0.515625, "grad_norm_var": 0.005594317118326823, "learning_rate": 0.0001, "loss": 1.5644, "loss/crossentropy": 2.464131474494934, "loss/fcd": 1.37890625, "loss/idx": 4.5, "loss/logits": 0.18554290384054184, "step": 706 }, { "epoch": 0.010557044624791883, "grad_norm": 0.5625, "grad_norm_var": 0.005628458658854167, "learning_rate": 0.0001, "loss": 1.6984, "loss/crossentropy": 2.2634644508361816, "loss/fcd": 1.5234375, "loss/idx": 4.5, "loss/logits": 0.1750023141503334, "step": 707 }, { "epoch": 0.010571976795406864, "grad_norm": 0.6328125, "grad_norm_var": 0.004969278971354167, "learning_rate": 0.0001, "loss": 1.7061, "loss/crossentropy": 2.3716477751731873, "loss/fcd": 1.4921875, "loss/idx": 4.5, "loss/logits": 0.21387499570846558, "step": 708 }, { "epoch": 0.010586908966021847, "grad_norm": 0.765625, "grad_norm_var": 0.006226539611816406, "learning_rate": 0.0001, "loss": 1.5934, "loss/crossentropy": 2.4680949449539185, "loss/fcd": 1.4296875, "loss/idx": 4.5, "loss/logits": 0.163712278008461, "step": 709 }, { "epoch": 0.010601841136636828, "grad_norm": 1.0546875, "grad_norm_var": 0.01647923787434896, "learning_rate": 0.0001, "loss": 1.9395, "loss/crossentropy": 2.2741931676864624, "loss/fcd": 1.6953125, "loss/idx": 4.5, "loss/logits": 0.24415750801563263, "step": 710 }, { "epoch": 0.010616773307251809, "grad_norm": 0.52734375, "grad_norm_var": 0.01730798085530599, "learning_rate": 0.0001, "loss": 1.5527, "loss/crossentropy": 2.863884925842285, "loss/fcd": 1.37890625, "loss/idx": 4.5, "loss/logits": 0.1737941950559616, "step": 711 }, { "epoch": 0.01063170547786679, "grad_norm": 0.46484375, "grad_norm_var": 0.01971613566080729, "learning_rate": 0.0001, "loss": 1.5012, "loss/crossentropy": 2.589643120765686, "loss/fcd": 1.3359375, "loss/idx": 4.5, "loss/logits": 0.16527117788791656, "step": 712 }, { "epoch": 0.010646637648481772, "grad_norm": 0.9921875, "grad_norm_var": 0.026842689514160155, "learning_rate": 0.0001, "loss": 1.7384, "loss/crossentropy": 2.3513693809509277, "loss/fcd": 1.56640625, "loss/idx": 4.5, "loss/logits": 0.1720396801829338, "step": 713 }, { "epoch": 0.010661569819096753, "grad_norm": 0.59765625, "grad_norm_var": 0.026709938049316408, "learning_rate": 0.0001, "loss": 1.4949, "loss/crossentropy": 2.5104587078094482, "loss/fcd": 1.3359375, "loss/idx": 4.5, "loss/logits": 0.15896443277597427, "step": 714 }, { "epoch": 0.010676501989711734, "grad_norm": 0.78125, "grad_norm_var": 0.027445220947265626, "learning_rate": 0.0001, "loss": 1.635, "loss/crossentropy": 2.6453863382339478, "loss/fcd": 1.4453125, "loss/idx": 4.5, "loss/logits": 0.18970200419425964, "step": 715 }, { "epoch": 0.010691434160326715, "grad_norm": 0.68359375, "grad_norm_var": 0.026979509989420572, "learning_rate": 0.0001, "loss": 1.7556, "loss/crossentropy": 2.2782176733016968, "loss/fcd": 1.5625, "loss/idx": 4.5, "loss/logits": 0.19310269504785538, "step": 716 }, { "epoch": 0.010706366330941698, "grad_norm": 0.89453125, "grad_norm_var": 0.030034319559733073, "learning_rate": 0.0001, "loss": 1.9091, "loss/crossentropy": 2.518513798713684, "loss/fcd": 1.64453125, "loss/idx": 4.5, "loss/logits": 0.2645469158887863, "step": 717 }, { "epoch": 0.010721298501556679, "grad_norm": 0.62109375, "grad_norm_var": 0.030255126953125, "learning_rate": 0.0001, "loss": 1.6619, "loss/crossentropy": 2.3726186752319336, "loss/fcd": 1.48046875, "loss/idx": 4.5, "loss/logits": 0.18143515288829803, "step": 718 }, { "epoch": 0.01073623067217166, "grad_norm": 0.71875, "grad_norm_var": 0.028519439697265624, "learning_rate": 0.0001, "loss": 1.8026, "loss/crossentropy": 2.561523675918579, "loss/fcd": 1.59765625, "loss/idx": 4.5, "loss/logits": 0.20491648465394974, "step": 719 }, { "epoch": 0.010751162842786642, "grad_norm": 0.68359375, "grad_norm_var": 0.028513018290201822, "learning_rate": 0.0001, "loss": 1.4863, "loss/crossentropy": 2.5928921699523926, "loss/fcd": 1.33203125, "loss/idx": 4.5, "loss/logits": 0.15430676937103271, "step": 720 }, { "epoch": 0.010766095013401623, "grad_norm": 0.69140625, "grad_norm_var": 0.028006998697916667, "learning_rate": 0.0001, "loss": 1.5925, "loss/crossentropy": 2.8197734355926514, "loss/fcd": 1.40625, "loss/idx": 4.5, "loss/logits": 0.18625715374946594, "step": 721 }, { "epoch": 0.010781027184016604, "grad_norm": 0.59375, "grad_norm_var": 0.026476033528645835, "learning_rate": 0.0001, "loss": 1.6078, "loss/crossentropy": 2.76321017742157, "loss/fcd": 1.41796875, "loss/idx": 4.5, "loss/logits": 0.18983574956655502, "step": 722 }, { "epoch": 0.010795959354631585, "grad_norm": 0.5859375, "grad_norm_var": 0.02606786092122396, "learning_rate": 0.0001, "loss": 1.5531, "loss/crossentropy": 2.5983407497406006, "loss/fcd": 1.375, "loss/idx": 4.5, "loss/logits": 0.17812784761190414, "step": 723 }, { "epoch": 0.010810891525246568, "grad_norm": 0.69140625, "grad_norm_var": 0.02571404774983724, "learning_rate": 0.0001, "loss": 1.7445, "loss/crossentropy": 2.4067989587783813, "loss/fcd": 1.5546875, "loss/idx": 4.5, "loss/logits": 0.18979395180940628, "step": 724 }, { "epoch": 0.010825823695861549, "grad_norm": 0.54296875, "grad_norm_var": 0.027138264973958333, "learning_rate": 0.0001, "loss": 1.7346, "loss/crossentropy": 2.627017021179199, "loss/fcd": 1.50390625, "loss/idx": 4.5, "loss/logits": 0.23074181377887726, "step": 725 }, { "epoch": 0.01084075586647653, "grad_norm": 0.52734375, "grad_norm_var": 0.01925042470296224, "learning_rate": 0.0001, "loss": 1.4865, "loss/crossentropy": 2.517759680747986, "loss/fcd": 1.33203125, "loss/idx": 4.5, "loss/logits": 0.15450593829154968, "step": 726 }, { "epoch": 0.010855688037091513, "grad_norm": 0.71484375, "grad_norm_var": 0.01807244618733724, "learning_rate": 0.0001, "loss": 1.8211, "loss/crossentropy": 2.503226161003113, "loss/fcd": 1.59375, "loss/idx": 4.5, "loss/logits": 0.22732951492071152, "step": 727 }, { "epoch": 0.010870620207706494, "grad_norm": 0.5390625, "grad_norm_var": 0.016346232096354166, "learning_rate": 0.0001, "loss": 1.5924, "loss/crossentropy": 2.5677419900894165, "loss/fcd": 1.4140625, "loss/idx": 4.5, "loss/logits": 0.17830483615398407, "step": 728 }, { "epoch": 0.010885552378321475, "grad_norm": 0.609375, "grad_norm_var": 0.009504954020182291, "learning_rate": 0.0001, "loss": 1.6721, "loss/crossentropy": 2.6440834999084473, "loss/fcd": 1.47265625, "loss/idx": 4.5, "loss/logits": 0.19942344725131989, "step": 729 }, { "epoch": 0.010900484548936456, "grad_norm": 0.53125, "grad_norm_var": 0.010286394755045574, "learning_rate": 0.0001, "loss": 1.5164, "loss/crossentropy": 2.8209099769592285, "loss/fcd": 1.34765625, "loss/idx": 4.5, "loss/logits": 0.16875500977039337, "step": 730 }, { "epoch": 0.010915416719551438, "grad_norm": 1.4765625, "grad_norm_var": 0.05261173248291016, "learning_rate": 0.0001, "loss": 1.7781, "loss/crossentropy": 2.5496203899383545, "loss/fcd": 1.54296875, "loss/idx": 4.5, "loss/logits": 0.2351158782839775, "step": 731 }, { "epoch": 0.01093034889016642, "grad_norm": 0.7265625, "grad_norm_var": 0.05266698201497396, "learning_rate": 0.0001, "loss": 1.834, "loss/crossentropy": 2.629123568534851, "loss/fcd": 1.59765625, "loss/idx": 4.5, "loss/logits": 0.23638265579938889, "step": 732 }, { "epoch": 0.0109452810607814, "grad_norm": 0.63671875, "grad_norm_var": 0.050023396809895836, "learning_rate": 0.0001, "loss": 1.65, "loss/crossentropy": 2.464652180671692, "loss/fcd": 1.46875, "loss/idx": 4.5, "loss/logits": 0.18121907860040665, "step": 733 }, { "epoch": 0.010960213231396381, "grad_norm": 0.984375, "grad_norm_var": 0.055386288960774736, "learning_rate": 0.0001, "loss": 1.8003, "loss/crossentropy": 2.5960421562194824, "loss/fcd": 1.58203125, "loss/idx": 4.5, "loss/logits": 0.2182973176240921, "step": 734 }, { "epoch": 0.010975145402011364, "grad_norm": 0.6015625, "grad_norm_var": 0.056004269917805986, "learning_rate": 0.0001, "loss": 1.5743, "loss/crossentropy": 2.6835473775863647, "loss/fcd": 1.40625, "loss/idx": 4.5, "loss/logits": 0.16806279867887497, "step": 735 }, { "epoch": 0.010990077572626345, "grad_norm": 0.671875, "grad_norm_var": 0.05603230794270833, "learning_rate": 0.0001, "loss": 1.6808, "loss/crossentropy": 2.8178036212921143, "loss/fcd": 1.48828125, "loss/idx": 4.5, "loss/logits": 0.19247954338788986, "step": 736 }, { "epoch": 0.011005009743241326, "grad_norm": 0.5390625, "grad_norm_var": 0.05756219228108724, "learning_rate": 0.0001, "loss": 1.5277, "loss/crossentropy": 2.874936580657959, "loss/fcd": 1.3515625, "loss/idx": 4.5, "loss/logits": 0.17609652876853943, "step": 737 }, { "epoch": 0.011019941913856308, "grad_norm": 0.56640625, "grad_norm_var": 0.05794448852539062, "learning_rate": 0.0001, "loss": 1.4942, "loss/crossentropy": 2.6819831132888794, "loss/fcd": 1.3359375, "loss/idx": 4.5, "loss/logits": 0.15826641023159027, "step": 738 }, { "epoch": 0.01103487408447129, "grad_norm": 0.65625, "grad_norm_var": 0.0573333740234375, "learning_rate": 0.0001, "loss": 1.6524, "loss/crossentropy": 2.48427951335907, "loss/fcd": 1.47265625, "loss/idx": 4.5, "loss/logits": 0.17978182435035706, "step": 739 }, { "epoch": 0.01104980625508627, "grad_norm": 0.765625, "grad_norm_var": 0.05770664215087891, "learning_rate": 0.0001, "loss": 1.6315, "loss/crossentropy": 2.6147966384887695, "loss/fcd": 1.46484375, "loss/idx": 4.5, "loss/logits": 0.16669780015945435, "step": 740 }, { "epoch": 0.011064738425701251, "grad_norm": 1.7421875, "grad_norm_var": 0.12358169555664063, "learning_rate": 0.0001, "loss": 1.9483, "loss/crossentropy": 2.789383292198181, "loss/fcd": 1.7265625, "loss/idx": 4.5, "loss/logits": 0.22178317606449127, "step": 741 }, { "epoch": 0.011079670596316234, "grad_norm": 0.59765625, "grad_norm_var": 0.1216339111328125, "learning_rate": 0.0001, "loss": 1.666, "loss/crossentropy": 2.650735855102539, "loss/fcd": 1.48828125, "loss/idx": 4.5, "loss/logits": 0.17772582918405533, "step": 742 }, { "epoch": 0.011094602766931215, "grad_norm": 0.7109375, "grad_norm_var": 0.12166487375895182, "learning_rate": 0.0001, "loss": 1.4375, "loss/crossentropy": 2.542524218559265, "loss/fcd": 1.29296875, "loss/idx": 4.5, "loss/logits": 0.1445121243596077, "step": 743 }, { "epoch": 0.011109534937546196, "grad_norm": 0.455078125, "grad_norm_var": 0.12471655209859213, "learning_rate": 0.0001, "loss": 1.4358, "loss/crossentropy": 2.5641543865203857, "loss/fcd": 1.28515625, "loss/idx": 4.5, "loss/logits": 0.1506011188030243, "step": 744 }, { "epoch": 0.011124467108161179, "grad_norm": 0.8203125, "grad_norm_var": 0.12306516965230306, "learning_rate": 0.0001, "loss": 1.8413, "loss/crossentropy": 2.5019315481185913, "loss/fcd": 1.62890625, "loss/idx": 4.5, "loss/logits": 0.2123931124806404, "step": 745 }, { "epoch": 0.01113939927877616, "grad_norm": 1.34375, "grad_norm_var": 0.13736062049865722, "learning_rate": 0.0001, "loss": 1.845, "loss/crossentropy": 2.5026661157608032, "loss/fcd": 1.63671875, "loss/idx": 4.5, "loss/logits": 0.20829641073942184, "step": 746 }, { "epoch": 0.01115433144939114, "grad_norm": 0.67578125, "grad_norm_var": 0.10850434303283692, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.7267106771469116, "loss/fcd": 1.3203125, "loss/idx": 4.5, "loss/logits": 0.14988812804222107, "step": 747 }, { "epoch": 0.011169263620006122, "grad_norm": 0.6640625, "grad_norm_var": 0.10920116106669107, "learning_rate": 0.0001, "loss": 1.5848, "loss/crossentropy": 2.7606505155563354, "loss/fcd": 1.40234375, "loss/idx": 4.5, "loss/logits": 0.18249479681253433, "step": 748 }, { "epoch": 0.011184195790621104, "grad_norm": 0.6875, "grad_norm_var": 0.1084126631418864, "learning_rate": 0.0001, "loss": 1.6927, "loss/crossentropy": 2.5876625776290894, "loss/fcd": 1.50390625, "loss/idx": 4.5, "loss/logits": 0.1888418346643448, "step": 749 }, { "epoch": 0.011199127961236085, "grad_norm": 0.578125, "grad_norm_var": 0.10766549110412597, "learning_rate": 0.0001, "loss": 1.6173, "loss/crossentropy": 2.8532919883728027, "loss/fcd": 1.421875, "loss/idx": 4.5, "loss/logits": 0.1953836902976036, "step": 750 }, { "epoch": 0.011214060131851066, "grad_norm": 0.6796875, "grad_norm_var": 0.10645114580790202, "learning_rate": 0.0001, "loss": 1.7082, "loss/crossentropy": 2.7735183238983154, "loss/fcd": 1.51953125, "loss/idx": 4.5, "loss/logits": 0.18861962109804153, "step": 751 }, { "epoch": 0.011228992302466047, "grad_norm": 0.92578125, "grad_norm_var": 0.10750908851623535, "learning_rate": 0.0001, "loss": 2.2305, "loss/crossentropy": 2.238771915435791, "loss/fcd": 1.8984375, "loss/idx": 4.5, "loss/logits": 0.3320219963788986, "step": 752 }, { "epoch": 0.01124392447308103, "grad_norm": 0.6328125, "grad_norm_var": 0.1051027774810791, "learning_rate": 0.0001, "loss": 1.5859, "loss/crossentropy": 2.843374013900757, "loss/fcd": 1.40625, "loss/idx": 4.5, "loss/logits": 0.17964357882738113, "step": 753 }, { "epoch": 0.01125885664369601, "grad_norm": 0.5625, "grad_norm_var": 0.1052156925201416, "learning_rate": 0.0001, "loss": 1.6273, "loss/crossentropy": 2.5842140913009644, "loss/fcd": 1.43359375, "loss/idx": 4.5, "loss/logits": 0.1937049850821495, "step": 754 }, { "epoch": 0.011273788814310992, "grad_norm": 0.5078125, "grad_norm_var": 0.10906434059143066, "learning_rate": 0.0001, "loss": 1.6215, "loss/crossentropy": 2.3504830598831177, "loss/fcd": 1.44921875, "loss/idx": 4.5, "loss/logits": 0.1723143458366394, "step": 755 }, { "epoch": 0.011288720984925974, "grad_norm": 0.67578125, "grad_norm_var": 0.10964341163635254, "learning_rate": 0.0001, "loss": 1.5916, "loss/crossentropy": 2.5767829418182373, "loss/fcd": 1.41796875, "loss/idx": 4.5, "loss/logits": 0.1736106276512146, "step": 756 }, { "epoch": 0.011303653155540955, "grad_norm": 0.9375, "grad_norm_var": 0.045402002334594724, "learning_rate": 0.0001, "loss": 1.7466, "loss/crossentropy": 2.8778876066207886, "loss/fcd": 1.5546875, "loss/idx": 4.5, "loss/logits": 0.19187040627002716, "step": 757 }, { "epoch": 0.011318585326155936, "grad_norm": 0.59765625, "grad_norm_var": 0.045402002334594724, "learning_rate": 0.0001, "loss": 1.736, "loss/crossentropy": 2.4122936725616455, "loss/fcd": 1.5390625, "loss/idx": 4.5, "loss/logits": 0.19690872728824615, "step": 758 }, { "epoch": 0.011333517496770917, "grad_norm": 0.98828125, "grad_norm_var": 0.05002439816792806, "learning_rate": 0.0001, "loss": 1.7155, "loss/crossentropy": 2.531772494316101, "loss/fcd": 1.51171875, "loss/idx": 4.5, "loss/logits": 0.2037365883588791, "step": 759 }, { "epoch": 0.0113484496673859, "grad_norm": 0.58984375, "grad_norm_var": 0.04616063435872396, "learning_rate": 0.0001, "loss": 1.6171, "loss/crossentropy": 2.5002293586730957, "loss/fcd": 1.44140625, "loss/idx": 4.5, "loss/logits": 0.17570620775222778, "step": 760 }, { "epoch": 0.011363381838000881, "grad_norm": 0.65625, "grad_norm_var": 0.04612325032552083, "learning_rate": 0.0001, "loss": 1.8533, "loss/crossentropy": 2.38156259059906, "loss/fcd": 1.63671875, "loss/idx": 4.5, "loss/logits": 0.21658504754304886, "step": 761 }, { "epoch": 0.011378314008615862, "grad_norm": 0.625, "grad_norm_var": 0.019731648763020835, "learning_rate": 0.0001, "loss": 1.5841, "loss/crossentropy": 2.541592836380005, "loss/fcd": 1.4140625, "loss/idx": 4.5, "loss/logits": 0.17004821822047234, "step": 762 }, { "epoch": 0.011393246179230845, "grad_norm": 0.69140625, "grad_norm_var": 0.019724527994791668, "learning_rate": 0.0001, "loss": 1.6894, "loss/crossentropy": 2.7292309999465942, "loss/fcd": 1.484375, "loss/idx": 4.5, "loss/logits": 0.20505433529615402, "step": 763 }, { "epoch": 0.011408178349845826, "grad_norm": 0.53125, "grad_norm_var": 0.02124201456705729, "learning_rate": 0.0001, "loss": 1.5474, "loss/crossentropy": 2.6666187047958374, "loss/fcd": 1.3828125, "loss/idx": 4.5, "loss/logits": 0.16459019482135773, "step": 764 }, { "epoch": 0.011423110520460807, "grad_norm": 0.6796875, "grad_norm_var": 0.0212371826171875, "learning_rate": 0.0001, "loss": 1.5077, "loss/crossentropy": 2.5730679035186768, "loss/fcd": 1.34375, "loss/idx": 4.5, "loss/logits": 0.16394272446632385, "step": 765 }, { "epoch": 0.011438042691075788, "grad_norm": 0.71875, "grad_norm_var": 0.020587158203125, "learning_rate": 0.0001, "loss": 1.6541, "loss/crossentropy": 2.655824899673462, "loss/fcd": 1.45703125, "loss/idx": 4.5, "loss/logits": 0.19702915847301483, "step": 766 }, { "epoch": 0.01145297486169077, "grad_norm": 0.60546875, "grad_norm_var": 0.021008745829264323, "learning_rate": 0.0001, "loss": 1.4335, "loss/crossentropy": 2.477110981941223, "loss/fcd": 1.2890625, "loss/idx": 4.5, "loss/logits": 0.14447420090436935, "step": 767 }, { "epoch": 0.011467907032305751, "grad_norm": 0.73046875, "grad_norm_var": 0.01706689198811849, "learning_rate": 0.0001, "loss": 1.4393, "loss/crossentropy": 2.8458136320114136, "loss/fcd": 1.2890625, "loss/idx": 4.5, "loss/logits": 0.15027117729187012, "step": 768 }, { "epoch": 0.011482839202920732, "grad_norm": 0.5703125, "grad_norm_var": 0.017626380920410155, "learning_rate": 0.0001, "loss": 1.6885, "loss/crossentropy": 2.53455650806427, "loss/fcd": 1.48046875, "loss/idx": 4.5, "loss/logits": 0.20800386369228363, "step": 769 }, { "epoch": 0.011497771373535713, "grad_norm": 0.59765625, "grad_norm_var": 0.0172149658203125, "learning_rate": 0.0001, "loss": 1.5591, "loss/crossentropy": 2.6400580406188965, "loss/fcd": 1.390625, "loss/idx": 4.5, "loss/logits": 0.16844240576028824, "step": 770 }, { "epoch": 0.011512703544150696, "grad_norm": 0.73046875, "grad_norm_var": 0.015529823303222657, "learning_rate": 0.0001, "loss": 1.5597, "loss/crossentropy": 2.5259393453598022, "loss/fcd": 1.390625, "loss/idx": 4.5, "loss/logits": 0.16906024515628815, "step": 771 }, { "epoch": 0.011527635714765677, "grad_norm": 0.59375, "grad_norm_var": 0.01602783203125, "learning_rate": 0.0001, "loss": 1.5399, "loss/crossentropy": 2.5816057920455933, "loss/fcd": 1.37890625, "loss/idx": 4.5, "loss/logits": 0.16103952378034592, "step": 772 }, { "epoch": 0.011542567885380658, "grad_norm": 0.734375, "grad_norm_var": 0.011571248372395834, "learning_rate": 0.0001, "loss": 1.5382, "loss/crossentropy": 2.8559324741363525, "loss/fcd": 1.38671875, "loss/idx": 4.5, "loss/logits": 0.15143951773643494, "step": 773 }, { "epoch": 0.01155750005599564, "grad_norm": 0.5703125, "grad_norm_var": 0.011863644917805989, "learning_rate": 0.0001, "loss": 1.5398, "loss/crossentropy": 2.618905544281006, "loss/fcd": 1.375, "loss/idx": 4.5, "loss/logits": 0.16479819267988205, "step": 774 }, { "epoch": 0.011572432226610621, "grad_norm": 0.7265625, "grad_norm_var": 0.004805246988932292, "learning_rate": 0.0001, "loss": 1.5362, "loss/crossentropy": 2.4410958290100098, "loss/fcd": 1.375, "loss/idx": 4.5, "loss/logits": 0.16121891140937805, "step": 775 }, { "epoch": 0.011587364397225602, "grad_norm": 0.671875, "grad_norm_var": 0.0046009699503580725, "learning_rate": 0.0001, "loss": 1.7557, "loss/crossentropy": 2.30470609664917, "loss/fcd": 1.5546875, "loss/idx": 4.5, "loss/logits": 0.2010229527950287, "step": 776 }, { "epoch": 0.011602296567840583, "grad_norm": 0.64453125, "grad_norm_var": 0.004603068033854167, "learning_rate": 0.0001, "loss": 1.5888, "loss/crossentropy": 2.371055006980896, "loss/fcd": 1.40625, "loss/idx": 4.5, "loss/logits": 0.1825580596923828, "step": 777 }, { "epoch": 0.011617228738455566, "grad_norm": 0.64453125, "grad_norm_var": 0.004558245340983073, "learning_rate": 0.0001, "loss": 1.7082, "loss/crossentropy": 2.692077875137329, "loss/fcd": 1.515625, "loss/idx": 4.5, "loss/logits": 0.1926237791776657, "step": 778 }, { "epoch": 0.011632160909070547, "grad_norm": 0.51171875, "grad_norm_var": 0.0056461970011393225, "learning_rate": 0.0001, "loss": 1.3531, "loss/crossentropy": 2.7112133502960205, "loss/fcd": 1.21875, "loss/idx": 4.5, "loss/logits": 0.1343764252960682, "step": 779 }, { "epoch": 0.011647093079685528, "grad_norm": 0.6171875, "grad_norm_var": 0.00484612782796224, "learning_rate": 0.0001, "loss": 1.5029, "loss/crossentropy": 2.5949923992156982, "loss/fcd": 1.34375, "loss/idx": 4.5, "loss/logits": 0.15916766971349716, "step": 780 }, { "epoch": 0.01166202525030051, "grad_norm": 0.7265625, "grad_norm_var": 0.00518945058186849, "learning_rate": 0.0001, "loss": 1.5756, "loss/crossentropy": 2.5727131366729736, "loss/fcd": 1.40625, "loss/idx": 4.5, "loss/logits": 0.16936737298965454, "step": 781 }, { "epoch": 0.011676957420915492, "grad_norm": 0.61328125, "grad_norm_var": 0.0049130757649739586, "learning_rate": 0.0001, "loss": 1.8494, "loss/crossentropy": 2.4157882928848267, "loss/fcd": 1.640625, "loss/idx": 4.75, "loss/logits": 0.2087443321943283, "step": 782 }, { "epoch": 0.011691889591530473, "grad_norm": 3.34375, "grad_norm_var": 0.45982252756754555, "learning_rate": 0.0001, "loss": 1.911, "loss/crossentropy": 2.7091548442840576, "loss/fcd": 1.703125, "loss/idx": 5.0, "loss/logits": 0.20787205547094345, "step": 783 }, { "epoch": 0.011706821762145454, "grad_norm": 2.828125, "grad_norm_var": 0.7114115397135417, "learning_rate": 0.0001, "loss": 1.8491, "loss/crossentropy": 2.588376045227051, "loss/fcd": 1.67578125, "loss/idx": 5.0, "loss/logits": 0.1733308508992195, "step": 784 }, { "epoch": 0.011721753932760436, "grad_norm": 1.6328125, "grad_norm_var": 0.7288431803385417, "learning_rate": 0.0001, "loss": 1.881, "loss/crossentropy": 2.472332239151001, "loss/fcd": 1.67578125, "loss/idx": 5.0, "loss/logits": 0.20525866746902466, "step": 785 }, { "epoch": 0.011736686103375417, "grad_norm": 1.1875, "grad_norm_var": 0.7180236180623373, "learning_rate": 0.0001, "loss": 1.8963, "loss/crossentropy": 2.606132984161377, "loss/fcd": 1.6875, "loss/idx": 5.0, "loss/logits": 0.2088092416524887, "step": 786 }, { "epoch": 0.011751618273990398, "grad_norm": 1.3515625, "grad_norm_var": 0.7157895406087239, "learning_rate": 0.0001, "loss": 2.0326, "loss/crossentropy": 2.485718846321106, "loss/fcd": 1.8125, "loss/idx": 5.0, "loss/logits": 0.22014504671096802, "step": 787 }, { "epoch": 0.011766550444605381, "grad_norm": 0.99609375, "grad_norm_var": 0.699424680074056, "learning_rate": 0.0001, "loss": 1.7551, "loss/crossentropy": 2.6328842639923096, "loss/fcd": 1.578125, "loss/idx": 5.0, "loss/logits": 0.1769460216164589, "step": 788 }, { "epoch": 0.011781482615220362, "grad_norm": 1.0859375, "grad_norm_var": 0.6894225438435873, "learning_rate": 0.0001, "loss": 1.78, "loss/crossentropy": 2.5578906536102295, "loss/fcd": 1.62109375, "loss/idx": 5.0, "loss/logits": 0.15887843072414398, "step": 789 }, { "epoch": 0.011796414785835343, "grad_norm": 0.88671875, "grad_norm_var": 0.6718770345052083, "learning_rate": 0.0001, "loss": 1.8571, "loss/crossentropy": 2.5024824142456055, "loss/fcd": 1.65234375, "loss/idx": 5.0, "loss/logits": 0.20473603904247284, "step": 790 }, { "epoch": 0.011811346956450324, "grad_norm": 1.0234375, "grad_norm_var": 0.6604543050130208, "learning_rate": 0.0001, "loss": 1.8476, "loss/crossentropy": 2.6586837768554688, "loss/fcd": 1.66015625, "loss/idx": 5.0, "loss/logits": 0.18739935755729675, "step": 791 }, { "epoch": 0.011826279127065306, "grad_norm": 1.1640625, "grad_norm_var": 0.642718251546224, "learning_rate": 0.0001, "loss": 2.1369, "loss/crossentropy": 2.6025651693344116, "loss/fcd": 1.89453125, "loss/idx": 5.0, "loss/logits": 0.24233842641115189, "step": 792 }, { "epoch": 0.011841211297680287, "grad_norm": 0.78125, "grad_norm_var": 0.6336949030558269, "learning_rate": 0.0001, "loss": 1.7646, "loss/crossentropy": 2.4655721187591553, "loss/fcd": 1.578125, "loss/idx": 5.0, "loss/logits": 0.18643079698085785, "step": 793 }, { "epoch": 0.011856143468295268, "grad_norm": 0.8125, "grad_norm_var": 0.6227457682291667, "learning_rate": 0.0001, "loss": 1.7683, "loss/crossentropy": 2.915129542350769, "loss/fcd": 1.5703125, "loss/idx": 5.0, "loss/logits": 0.1980355903506279, "step": 794 }, { "epoch": 0.01187107563891025, "grad_norm": 0.77734375, "grad_norm_var": 0.6019765218098958, "learning_rate": 0.0001, "loss": 1.7054, "loss/crossentropy": 2.5375667810440063, "loss/fcd": 1.52734375, "loss/idx": 5.0, "loss/logits": 0.1780666932463646, "step": 795 }, { "epoch": 0.011886007809525232, "grad_norm": 0.8515625, "grad_norm_var": 0.5859700520833333, "learning_rate": 0.0001, "loss": 1.9032, "loss/crossentropy": 2.496041178703308, "loss/fcd": 1.6875, "loss/idx": 5.0, "loss/logits": 0.21571382135152817, "step": 796 }, { "epoch": 0.011900939980140213, "grad_norm": 0.796875, "grad_norm_var": 0.5813351949055989, "learning_rate": 0.0001, "loss": 1.7965, "loss/crossentropy": 2.4716135263442993, "loss/fcd": 1.6015625, "loss/idx": 5.0, "loss/logits": 0.19488991051912308, "step": 797 }, { "epoch": 0.011915872150755194, "grad_norm": 0.75, "grad_norm_var": 0.5707452774047852, "learning_rate": 0.0001, "loss": 1.6519, "loss/crossentropy": 2.6149767637252808, "loss/fcd": 1.48046875, "loss/idx": 5.0, "loss/logits": 0.17144525051116943, "step": 798 }, { "epoch": 0.011930804321370177, "grad_norm": 1.546875, "grad_norm_var": 0.27495110829671227, "learning_rate": 0.0001, "loss": 2.0333, "loss/crossentropy": 2.268043637275696, "loss/fcd": 1.8359375, "loss/idx": 5.0, "loss/logits": 0.19733671098947525, "step": 799 }, { "epoch": 0.011945736491985158, "grad_norm": 0.80859375, "grad_norm_var": 0.07921040852864583, "learning_rate": 0.0001, "loss": 1.7229, "loss/crossentropy": 2.6515145301818848, "loss/fcd": 1.53515625, "loss/idx": 5.0, "loss/logits": 0.1877676323056221, "step": 800 }, { "epoch": 0.011960668662600139, "grad_norm": 0.80859375, "grad_norm_var": 0.055237770080566406, "learning_rate": 0.0001, "loss": 1.9004, "loss/crossentropy": 2.5546780824661255, "loss/fcd": 1.68359375, "loss/idx": 5.0, "loss/logits": 0.21680361777544022, "step": 801 }, { "epoch": 0.01197560083321512, "grad_norm": 0.76171875, "grad_norm_var": 0.05460713704427083, "learning_rate": 0.0001, "loss": 1.7663, "loss/crossentropy": 2.465883493423462, "loss/fcd": 1.58203125, "loss/idx": 5.0, "loss/logits": 0.18430602550506592, "step": 802 }, { "epoch": 0.011990533003830102, "grad_norm": 0.859375, "grad_norm_var": 0.04340794881184896, "learning_rate": 0.0001, "loss": 1.7138, "loss/crossentropy": 2.5838167667388916, "loss/fcd": 1.53125, "loss/idx": 5.0, "loss/logits": 0.1825515776872635, "step": 803 }, { "epoch": 0.012005465174445083, "grad_norm": 0.66796875, "grad_norm_var": 0.046783192952473955, "learning_rate": 0.0001, "loss": 1.7907, "loss/crossentropy": 2.5992263555526733, "loss/fcd": 1.58984375, "loss/idx": 5.0, "loss/logits": 0.20081757754087448, "step": 804 }, { "epoch": 0.012020397345060064, "grad_norm": 0.67578125, "grad_norm_var": 0.047070248921712236, "learning_rate": 0.0001, "loss": 1.6835, "loss/crossentropy": 2.5859053134918213, "loss/fcd": 1.51171875, "loss/idx": 5.0, "loss/logits": 0.17173586785793304, "step": 805 }, { "epoch": 0.012035329515675047, "grad_norm": 0.6484375, "grad_norm_var": 0.0501922607421875, "learning_rate": 0.0001, "loss": 1.6041, "loss/crossentropy": 2.4893710613250732, "loss/fcd": 1.43359375, "loss/idx": 5.0, "loss/logits": 0.17046400159597397, "step": 806 }, { "epoch": 0.012050261686290028, "grad_norm": 0.7265625, "grad_norm_var": 0.04916788736979167, "learning_rate": 0.0001, "loss": 1.8526, "loss/crossentropy": 2.5315016508102417, "loss/fcd": 1.62109375, "loss/idx": 5.0, "loss/logits": 0.23148700594902039, "step": 807 }, { "epoch": 0.012065193856905009, "grad_norm": 0.83984375, "grad_norm_var": 0.041722043355305986, "learning_rate": 0.0001, "loss": 1.9535, "loss/crossentropy": 2.791074752807617, "loss/fcd": 1.7109375, "loss/idx": 5.0, "loss/logits": 0.24258101731538773, "step": 808 }, { "epoch": 0.01208012602751999, "grad_norm": 0.73828125, "grad_norm_var": 0.042057037353515625, "learning_rate": 0.0001, "loss": 1.6953, "loss/crossentropy": 2.7802544832229614, "loss/fcd": 1.52734375, "loss/idx": 5.0, "loss/logits": 0.1679670214653015, "step": 809 }, { "epoch": 0.012095058198134973, "grad_norm": 0.79296875, "grad_norm_var": 0.042092323303222656, "learning_rate": 0.0001, "loss": 1.6601, "loss/crossentropy": 2.8163868188858032, "loss/fcd": 1.4921875, "loss/idx": 5.0, "loss/logits": 0.1679074689745903, "step": 810 }, { "epoch": 0.012109990368749953, "grad_norm": 0.65625, "grad_norm_var": 0.04362767537434896, "learning_rate": 0.0001, "loss": 1.6613, "loss/crossentropy": 2.4688737392425537, "loss/fcd": 1.48828125, "loss/idx": 5.0, "loss/logits": 0.17299792170524597, "step": 811 }, { "epoch": 0.012124922539364934, "grad_norm": 0.77734375, "grad_norm_var": 0.043541908264160156, "learning_rate": 0.0001, "loss": 1.6717, "loss/crossentropy": 2.6986693143844604, "loss/fcd": 1.48828125, "loss/idx": 5.0, "loss/logits": 0.18341868370771408, "step": 812 }, { "epoch": 0.012139854709979915, "grad_norm": 0.9609375, "grad_norm_var": 0.04507999420166016, "learning_rate": 0.0001, "loss": 1.8393, "loss/crossentropy": 2.588726282119751, "loss/fcd": 1.6171875, "loss/idx": 5.0, "loss/logits": 0.22206994891166687, "step": 813 }, { "epoch": 0.012154786880594898, "grad_norm": 0.6796875, "grad_norm_var": 0.045986366271972653, "learning_rate": 0.0001, "loss": 1.766, "loss/crossentropy": 2.444581627845764, "loss/fcd": 1.5625, "loss/idx": 5.0, "loss/logits": 0.20353230834007263, "step": 814 }, { "epoch": 0.012169719051209879, "grad_norm": 0.765625, "grad_norm_var": 0.007305335998535156, "learning_rate": 0.0001, "loss": 1.8629, "loss/crossentropy": 2.46638023853302, "loss/fcd": 1.64453125, "loss/idx": 5.0, "loss/logits": 0.21839633584022522, "step": 815 }, { "epoch": 0.01218465122182486, "grad_norm": 0.69921875, "grad_norm_var": 0.00735162099202474, "learning_rate": 0.0001, "loss": 1.634, "loss/crossentropy": 2.41643226146698, "loss/fcd": 1.45703125, "loss/idx": 5.0, "loss/logits": 0.17698514461517334, "step": 816 }, { "epoch": 0.012199583392439843, "grad_norm": 0.82421875, "grad_norm_var": 0.00748132069905599, "learning_rate": 0.0001, "loss": 1.9725, "loss/crossentropy": 2.3987841606140137, "loss/fcd": 1.73046875, "loss/idx": 5.0, "loss/logits": 0.24200860410928726, "step": 817 }, { "epoch": 0.012214515563054824, "grad_norm": 0.56640625, "grad_norm_var": 0.009681129455566406, "learning_rate": 0.0001, "loss": 1.5978, "loss/crossentropy": 2.526577949523926, "loss/fcd": 1.4453125, "loss/idx": 5.0, "loss/logits": 0.15250347554683685, "step": 818 }, { "epoch": 0.012229447733669805, "grad_norm": 1.234375, "grad_norm_var": 0.024317359924316405, "learning_rate": 0.0001, "loss": 1.9582, "loss/crossentropy": 2.2526134252548218, "loss/fcd": 1.7734375, "loss/idx": 5.0, "loss/logits": 0.18472721427679062, "step": 819 }, { "epoch": 0.012244379904284786, "grad_norm": 0.84765625, "grad_norm_var": 0.02398980458577474, "learning_rate": 0.0001, "loss": 1.9581, "loss/crossentropy": 2.593246102333069, "loss/fcd": 1.734375, "loss/idx": 5.0, "loss/logits": 0.223709836602211, "step": 820 }, { "epoch": 0.012259312074899768, "grad_norm": 0.6953125, "grad_norm_var": 0.023749796549479167, "learning_rate": 0.0001, "loss": 1.6906, "loss/crossentropy": 2.491560935974121, "loss/fcd": 1.5078125, "loss/idx": 5.0, "loss/logits": 0.18277642875909805, "step": 821 }, { "epoch": 0.01227424424551475, "grad_norm": 0.6953125, "grad_norm_var": 0.023075358072916666, "learning_rate": 0.0001, "loss": 1.7343, "loss/crossentropy": 2.584471344947815, "loss/fcd": 1.5390625, "loss/idx": 5.0, "loss/logits": 0.19519615173339844, "step": 822 }, { "epoch": 0.01228917641612973, "grad_norm": 0.7578125, "grad_norm_var": 0.022908528645833332, "learning_rate": 0.0001, "loss": 1.7162, "loss/crossentropy": 2.3712323904037476, "loss/fcd": 1.54296875, "loss/idx": 5.0, "loss/logits": 0.17325877398252487, "step": 823 }, { "epoch": 0.012304108586744713, "grad_norm": 0.8515625, "grad_norm_var": 0.02300561269124349, "learning_rate": 0.0001, "loss": 1.8932, "loss/crossentropy": 2.8186241388320923, "loss/fcd": 1.67578125, "loss/idx": 5.0, "loss/logits": 0.21743790805339813, "step": 824 }, { "epoch": 0.012319040757359694, "grad_norm": 0.58984375, "grad_norm_var": 0.025286293029785155, "learning_rate": 0.0001, "loss": 1.5933, "loss/crossentropy": 2.623106360435486, "loss/fcd": 1.43359375, "loss/idx": 5.0, "loss/logits": 0.15974538028240204, "step": 825 }, { "epoch": 0.012333972927974675, "grad_norm": 0.828125, "grad_norm_var": 0.025449371337890624, "learning_rate": 0.0001, "loss": 1.7932, "loss/crossentropy": 2.6241841316223145, "loss/fcd": 1.60546875, "loss/idx": 5.0, "loss/logits": 0.18778088688850403, "step": 826 }, { "epoch": 0.012348905098589656, "grad_norm": 0.74609375, "grad_norm_var": 0.024509112040201824, "learning_rate": 0.0001, "loss": 1.7246, "loss/crossentropy": 2.3002325296401978, "loss/fcd": 1.5546875, "loss/idx": 5.0, "loss/logits": 0.16992325335741043, "step": 827 }, { "epoch": 0.012363837269204639, "grad_norm": 0.76953125, "grad_norm_var": 0.024518267313639323, "learning_rate": 0.0001, "loss": 1.6774, "loss/crossentropy": 2.769911289215088, "loss/fcd": 1.49609375, "loss/idx": 5.0, "loss/logits": 0.18125663697719574, "step": 828 }, { "epoch": 0.01237876943981962, "grad_norm": 0.80078125, "grad_norm_var": 0.022299957275390626, "learning_rate": 0.0001, "loss": 1.6901, "loss/crossentropy": 2.5946470499038696, "loss/fcd": 1.51953125, "loss/idx": 5.0, "loss/logits": 0.17055295407772064, "step": 829 }, { "epoch": 0.0123937016104346, "grad_norm": 0.6484375, "grad_norm_var": 0.022745513916015626, "learning_rate": 0.0001, "loss": 1.6234, "loss/crossentropy": 2.5415327548980713, "loss/fcd": 1.47265625, "loss/idx": 5.0, "loss/logits": 0.15074985474348068, "step": 830 }, { "epoch": 0.012408633781049581, "grad_norm": 0.6328125, "grad_norm_var": 0.02392578125, "learning_rate": 0.0001, "loss": 1.7857, "loss/crossentropy": 2.580002784729004, "loss/fcd": 1.58203125, "loss/idx": 5.0, "loss/logits": 0.2036324143409729, "step": 831 }, { "epoch": 0.012423565951664564, "grad_norm": 0.87890625, "grad_norm_var": 0.02444636027018229, "learning_rate": 0.0001, "loss": 1.7464, "loss/crossentropy": 2.715728998184204, "loss/fcd": 1.5625, "loss/idx": 5.0, "loss/logits": 0.18391364812850952, "step": 832 }, { "epoch": 0.012438498122279545, "grad_norm": 0.86328125, "grad_norm_var": 0.024808756510416665, "learning_rate": 0.0001, "loss": 1.9102, "loss/crossentropy": 2.602025628089905, "loss/fcd": 1.70703125, "loss/idx": 5.0, "loss/logits": 0.2031889334321022, "step": 833 }, { "epoch": 0.012453430292894526, "grad_norm": 0.8125, "grad_norm_var": 0.021736590067545573, "learning_rate": 0.0001, "loss": 1.817, "loss/crossentropy": 2.6534132957458496, "loss/fcd": 1.62109375, "loss/idx": 5.0, "loss/logits": 0.195940762758255, "step": 834 }, { "epoch": 0.012468362463509509, "grad_norm": 0.59375, "grad_norm_var": 0.00949548085530599, "learning_rate": 0.0001, "loss": 1.6428, "loss/crossentropy": 2.453692674636841, "loss/fcd": 1.47265625, "loss/idx": 5.0, "loss/logits": 0.17016924917697906, "step": 835 }, { "epoch": 0.01248329463412449, "grad_norm": 0.78515625, "grad_norm_var": 0.008931922912597656, "learning_rate": 0.0001, "loss": 1.666, "loss/crossentropy": 2.7629364728927612, "loss/fcd": 1.49609375, "loss/idx": 5.0, "loss/logits": 0.16993117332458496, "step": 836 }, { "epoch": 0.01249822680473947, "grad_norm": 0.67578125, "grad_norm_var": 0.009089914957682292, "learning_rate": 0.0001, "loss": 1.6307, "loss/crossentropy": 2.5838505029678345, "loss/fcd": 1.4609375, "loss/idx": 5.0, "loss/logits": 0.16975942254066467, "step": 837 }, { "epoch": 0.012513158975354452, "grad_norm": 0.66796875, "grad_norm_var": 0.00932000478108724, "learning_rate": 0.0001, "loss": 1.6076, "loss/crossentropy": 2.676839232444763, "loss/fcd": 1.453125, "loss/idx": 5.0, "loss/logits": 0.15447519719600677, "step": 838 }, { "epoch": 0.012528091145969434, "grad_norm": 0.73828125, "grad_norm_var": 0.009307607014973959, "learning_rate": 0.0001, "loss": 1.9002, "loss/crossentropy": 2.4299418926239014, "loss/fcd": 1.68359375, "loss/idx": 5.0, "loss/logits": 0.2166183888912201, "step": 839 }, { "epoch": 0.012543023316584415, "grad_norm": 0.71875, "grad_norm_var": 0.008481852213541667, "learning_rate": 0.0001, "loss": 1.6825, "loss/crossentropy": 2.6252626180648804, "loss/fcd": 1.51171875, "loss/idx": 5.0, "loss/logits": 0.17076187580823898, "step": 840 }, { "epoch": 0.012557955487199396, "grad_norm": 0.94140625, "grad_norm_var": 0.009431711832682292, "learning_rate": 0.0001, "loss": 1.8972, "loss/crossentropy": 2.530099391937256, "loss/fcd": 1.68359375, "loss/idx": 5.0, "loss/logits": 0.21358779817819595, "step": 841 }, { "epoch": 0.012572887657814379, "grad_norm": 0.7109375, "grad_norm_var": 0.009168497721354167, "learning_rate": 0.0001, "loss": 1.6495, "loss/crossentropy": 2.651611328125, "loss/fcd": 1.47265625, "loss/idx": 5.0, "loss/logits": 0.17685066163539886, "step": 842 }, { "epoch": 0.01258781982842936, "grad_norm": 0.75, "grad_norm_var": 0.009167925516764323, "learning_rate": 0.0001, "loss": 1.6369, "loss/crossentropy": 2.6540257930755615, "loss/fcd": 1.46875, "loss/idx": 5.0, "loss/logits": 0.16816890239715576, "step": 843 }, { "epoch": 0.012602751999044341, "grad_norm": 0.6796875, "grad_norm_var": 0.009429677327473959, "learning_rate": 0.0001, "loss": 1.681, "loss/crossentropy": 2.589913010597229, "loss/fcd": 1.4921875, "loss/idx": 5.0, "loss/logits": 0.18879953026771545, "step": 844 }, { "epoch": 0.012617684169659322, "grad_norm": 0.69921875, "grad_norm_var": 0.009300740559895833, "learning_rate": 0.0001, "loss": 1.6358, "loss/crossentropy": 2.5078059434890747, "loss/fcd": 1.48046875, "loss/idx": 5.0, "loss/logits": 0.15536697953939438, "step": 845 }, { "epoch": 0.012632616340274305, "grad_norm": 0.96875, "grad_norm_var": 0.011917877197265624, "learning_rate": 0.0001, "loss": 1.5863, "loss/crossentropy": 2.7713186740875244, "loss/fcd": 1.4296875, "loss/idx": 5.0, "loss/logits": 0.1565636619925499, "step": 846 }, { "epoch": 0.012647548510889286, "grad_norm": 0.671875, "grad_norm_var": 0.01136474609375, "learning_rate": 0.0001, "loss": 1.5134, "loss/crossentropy": 2.5787158012390137, "loss/fcd": 1.37109375, "loss/idx": 5.0, "loss/logits": 0.14233843982219696, "step": 847 }, { "epoch": 0.012662480681504266, "grad_norm": 0.71875, "grad_norm_var": 0.010423723856608074, "learning_rate": 0.0001, "loss": 1.7599, "loss/crossentropy": 2.6209789514541626, "loss/fcd": 1.57421875, "loss/idx": 5.0, "loss/logits": 0.18572837859392166, "step": 848 }, { "epoch": 0.01267741285211925, "grad_norm": 0.87109375, "grad_norm_var": 0.010545794169108074, "learning_rate": 0.0001, "loss": 1.7243, "loss/crossentropy": 3.124199151992798, "loss/fcd": 1.52734375, "loss/idx": 5.0, "loss/logits": 0.19692812114953995, "step": 849 }, { "epoch": 0.01269234502273423, "grad_norm": 0.70703125, "grad_norm_var": 0.010365549723307292, "learning_rate": 0.0001, "loss": 1.8374, "loss/crossentropy": 2.7884299755096436, "loss/fcd": 1.62109375, "loss/idx": 5.0, "loss/logits": 0.21627703309059143, "step": 850 }, { "epoch": 0.012707277193349211, "grad_norm": 0.7421875, "grad_norm_var": 0.008775838216145833, "learning_rate": 0.0001, "loss": 1.8568, "loss/crossentropy": 2.615527868270874, "loss/fcd": 1.63671875, "loss/idx": 5.0, "loss/logits": 0.22006352990865707, "step": 851 }, { "epoch": 0.012722209363964192, "grad_norm": 0.6484375, "grad_norm_var": 0.00935662587483724, "learning_rate": 0.0001, "loss": 1.7633, "loss/crossentropy": 2.4802143573760986, "loss/fcd": 1.56640625, "loss/idx": 5.0, "loss/logits": 0.19686861336231232, "step": 852 }, { "epoch": 0.012737141534579175, "grad_norm": 0.78515625, "grad_norm_var": 0.009103838602701824, "learning_rate": 0.0001, "loss": 1.7749, "loss/crossentropy": 2.7521532773971558, "loss/fcd": 1.5859375, "loss/idx": 5.0, "loss/logits": 0.18896755576133728, "step": 853 }, { "epoch": 0.012752073705194156, "grad_norm": 0.73046875, "grad_norm_var": 0.008654212951660157, "learning_rate": 0.0001, "loss": 1.9076, "loss/crossentropy": 2.4716649055480957, "loss/fcd": 1.68359375, "loss/idx": 5.0, "loss/logits": 0.22396356612443924, "step": 854 }, { "epoch": 0.012767005875809137, "grad_norm": 0.87109375, "grad_norm_var": 0.009458351135253906, "learning_rate": 0.0001, "loss": 1.7463, "loss/crossentropy": 2.7693766355514526, "loss/fcd": 1.5703125, "loss/idx": 5.0, "loss/logits": 0.17594317346811295, "step": 855 }, { "epoch": 0.012781938046424118, "grad_norm": 0.8203125, "grad_norm_var": 0.009498023986816406, "learning_rate": 0.0001, "loss": 1.8636, "loss/crossentropy": 2.4391822814941406, "loss/fcd": 1.63671875, "loss/idx": 5.0, "loss/logits": 0.22686351835727692, "step": 856 }, { "epoch": 0.0127968702170391, "grad_norm": 0.671875, "grad_norm_var": 0.0078704833984375, "learning_rate": 0.0001, "loss": 1.7254, "loss/crossentropy": 2.591761350631714, "loss/fcd": 1.53125, "loss/idx": 5.0, "loss/logits": 0.19416391849517822, "step": 857 }, { "epoch": 0.012811802387654081, "grad_norm": 0.8203125, "grad_norm_var": 0.008005777994791666, "learning_rate": 0.0001, "loss": 1.8034, "loss/crossentropy": 2.520877718925476, "loss/fcd": 1.60546875, "loss/idx": 5.0, "loss/logits": 0.1978968232870102, "step": 858 }, { "epoch": 0.012826734558269062, "grad_norm": 0.6875, "grad_norm_var": 0.008331298828125, "learning_rate": 0.0001, "loss": 1.7766, "loss/crossentropy": 2.5855950117111206, "loss/fcd": 1.578125, "loss/idx": 5.0, "loss/logits": 0.198471337556839, "step": 859 }, { "epoch": 0.012841666728884045, "grad_norm": 0.64453125, "grad_norm_var": 0.008765602111816406, "learning_rate": 0.0001, "loss": 1.73, "loss/crossentropy": 2.7856252193450928, "loss/fcd": 1.53125, "loss/idx": 5.0, "loss/logits": 0.19874712824821472, "step": 860 }, { "epoch": 0.012856598899499026, "grad_norm": 0.6640625, "grad_norm_var": 0.009098052978515625, "learning_rate": 0.0001, "loss": 1.7602, "loss/crossentropy": 2.439252257347107, "loss/fcd": 1.5703125, "loss/idx": 5.0, "loss/logits": 0.18988988548517227, "step": 861 }, { "epoch": 0.012871531070114007, "grad_norm": 0.69921875, "grad_norm_var": 0.005829811096191406, "learning_rate": 0.0001, "loss": 1.585, "loss/crossentropy": 2.6466290950775146, "loss/fcd": 1.421875, "loss/idx": 5.0, "loss/logits": 0.16309361904859543, "step": 862 }, { "epoch": 0.012886463240728988, "grad_norm": 0.64453125, "grad_norm_var": 0.006105295817057292, "learning_rate": 0.0001, "loss": 1.7727, "loss/crossentropy": 2.5705440044403076, "loss/fcd": 1.578125, "loss/idx": 5.0, "loss/logits": 0.19455686211585999, "step": 863 }, { "epoch": 0.01290139541134397, "grad_norm": 0.61328125, "grad_norm_var": 0.006999651590983073, "learning_rate": 0.0001, "loss": 1.6992, "loss/crossentropy": 2.6901649236679077, "loss/fcd": 1.515625, "loss/idx": 5.0, "loss/logits": 0.18361696600914001, "step": 864 }, { "epoch": 0.012916327581958952, "grad_norm": 0.81640625, "grad_norm_var": 0.00613091786702474, "learning_rate": 0.0001, "loss": 1.7852, "loss/crossentropy": 2.6737335920333862, "loss/fcd": 1.58203125, "loss/idx": 5.0, "loss/logits": 0.20320622622966766, "step": 865 }, { "epoch": 0.012931259752573932, "grad_norm": 0.671875, "grad_norm_var": 0.0062825520833333336, "learning_rate": 0.0001, "loss": 1.5996, "loss/crossentropy": 2.6164817810058594, "loss/fcd": 1.42578125, "loss/idx": 5.0, "loss/logits": 0.17384624481201172, "step": 866 }, { "epoch": 0.012946191923188915, "grad_norm": 0.6953125, "grad_norm_var": 0.006285603841145833, "learning_rate": 0.0001, "loss": 1.6747, "loss/crossentropy": 2.4937325716018677, "loss/fcd": 1.4921875, "loss/idx": 5.0, "loss/logits": 0.18247970193624496, "step": 867 }, { "epoch": 0.012961124093803896, "grad_norm": 0.66015625, "grad_norm_var": 0.006185849507649739, "learning_rate": 0.0001, "loss": 1.716, "loss/crossentropy": 2.7091100215911865, "loss/fcd": 1.5234375, "loss/idx": 5.0, "loss/logits": 0.19258494675159454, "step": 868 }, { "epoch": 0.012976056264418877, "grad_norm": 0.72265625, "grad_norm_var": 0.00587457021077474, "learning_rate": 0.0001, "loss": 1.7992, "loss/crossentropy": 2.482669949531555, "loss/fcd": 1.59375, "loss/idx": 5.0, "loss/logits": 0.20546174049377441, "step": 869 }, { "epoch": 0.012990988435033858, "grad_norm": 0.6484375, "grad_norm_var": 0.006121571858723958, "learning_rate": 0.0001, "loss": 1.6672, "loss/crossentropy": 2.4320571422576904, "loss/fcd": 1.484375, "loss/idx": 5.0, "loss/logits": 0.18284663558006287, "step": 870 }, { "epoch": 0.01300592060564884, "grad_norm": 0.5625, "grad_norm_var": 0.005423418680826823, "learning_rate": 0.0001, "loss": 1.5728, "loss/crossentropy": 2.7169127464294434, "loss/fcd": 1.41015625, "loss/idx": 5.0, "loss/logits": 0.16260702162981033, "step": 871 }, { "epoch": 0.013020852776263822, "grad_norm": 0.796875, "grad_norm_var": 0.0050511042277018225, "learning_rate": 0.0001, "loss": 1.6553, "loss/crossentropy": 2.3346874713897705, "loss/fcd": 1.4921875, "loss/idx": 5.0, "loss/logits": 0.1631385162472725, "step": 872 }, { "epoch": 0.013035784946878803, "grad_norm": 0.78515625, "grad_norm_var": 0.005598704020182292, "learning_rate": 0.0001, "loss": 1.5253, "loss/crossentropy": 2.60079562664032, "loss/fcd": 1.3828125, "loss/idx": 5.0, "loss/logits": 0.14248249679803848, "step": 873 }, { "epoch": 0.013050717117493784, "grad_norm": 0.365234375, "grad_norm_var": 0.010987202326456705, "learning_rate": 0.0001, "loss": 1.5183, "loss/crossentropy": 2.696221947669983, "loss/fcd": 1.359375, "loss/idx": 5.5, "loss/logits": 0.15895313769578934, "step": 874 }, { "epoch": 0.013065649288108766, "grad_norm": 1.3828125, "grad_norm_var": 0.043070713678995766, "learning_rate": 0.0001, "loss": 3.0924, "loss/crossentropy": 2.6322977542877197, "loss/fcd": 2.62109375, "loss/idx": 5.5, "loss/logits": 0.47132138907909393, "step": 875 }, { "epoch": 0.013080581458723747, "grad_norm": 0.3828125, "grad_norm_var": 0.04966479937235514, "learning_rate": 0.0001, "loss": 1.6102, "loss/crossentropy": 2.672438383102417, "loss/fcd": 1.43359375, "loss/idx": 5.5, "loss/logits": 0.17659874260425568, "step": 876 }, { "epoch": 0.013095513629338728, "grad_norm": 0.345703125, "grad_norm_var": 0.05728956858317057, "learning_rate": 0.0001, "loss": 1.564, "loss/crossentropy": 2.610317587852478, "loss/fcd": 1.390625, "loss/idx": 5.5, "loss/logits": 0.17338179051876068, "step": 877 }, { "epoch": 0.013110445799953711, "grad_norm": 0.486328125, "grad_norm_var": 0.05942228635152181, "learning_rate": 0.0001, "loss": 1.6579, "loss/crossentropy": 2.7957422733306885, "loss/fcd": 1.46484375, "loss/idx": 5.5, "loss/logits": 0.19305869936943054, "step": 878 }, { "epoch": 0.013125377970568692, "grad_norm": 0.484375, "grad_norm_var": 0.06138253211975098, "learning_rate": 0.0001, "loss": 1.8523, "loss/crossentropy": 2.4633307456970215, "loss/fcd": 1.61328125, "loss/idx": 5.5, "loss/logits": 0.23906587064266205, "step": 879 }, { "epoch": 0.013140310141183673, "grad_norm": 0.330078125, "grad_norm_var": 0.06782881418863933, "learning_rate": 0.0001, "loss": 1.5047, "loss/crossentropy": 2.651333808898926, "loss/fcd": 1.34375, "loss/idx": 5.5, "loss/logits": 0.16093496978282928, "step": 880 }, { "epoch": 0.013155242311798654, "grad_norm": 0.36328125, "grad_norm_var": 0.06961358388264974, "learning_rate": 0.0001, "loss": 1.6017, "loss/crossentropy": 2.7588586807250977, "loss/fcd": 1.421875, "loss/idx": 5.5, "loss/logits": 0.17987017333507538, "step": 881 }, { "epoch": 0.013170174482413637, "grad_norm": 0.298828125, "grad_norm_var": 0.07499616940816244, "learning_rate": 0.0001, "loss": 1.4945, "loss/crossentropy": 2.5945587158203125, "loss/fcd": 1.328125, "loss/idx": 5.5, "loss/logits": 0.16634615510702133, "step": 882 }, { "epoch": 0.013185106653028618, "grad_norm": 0.369140625, "grad_norm_var": 0.07671356201171875, "learning_rate": 0.0001, "loss": 1.5697, "loss/crossentropy": 2.6372928619384766, "loss/fcd": 1.39453125, "loss/idx": 5.5, "loss/logits": 0.17521393299102783, "step": 883 }, { "epoch": 0.013200038823643599, "grad_norm": 0.47265625, "grad_norm_var": 0.07644500732421874, "learning_rate": 0.0001, "loss": 1.6773, "loss/crossentropy": 2.472917318344116, "loss/fcd": 1.4921875, "loss/idx": 5.5, "loss/logits": 0.18511150032281876, "step": 884 }, { "epoch": 0.013214970994258581, "grad_norm": 0.310546875, "grad_norm_var": 0.07756180763244629, "learning_rate": 0.0001, "loss": 1.5916, "loss/crossentropy": 2.264296770095825, "loss/fcd": 1.40625, "loss/idx": 5.5, "loss/logits": 0.18537598103284836, "step": 885 }, { "epoch": 0.013229903164873562, "grad_norm": 0.37890625, "grad_norm_var": 0.07763199806213379, "learning_rate": 0.0001, "loss": 1.6628, "loss/crossentropy": 2.6821300983428955, "loss/fcd": 1.45703125, "loss/idx": 5.5, "loss/logits": 0.20572075992822647, "step": 886 }, { "epoch": 0.013244835335488543, "grad_norm": 0.314453125, "grad_norm_var": 0.07964859008789063, "learning_rate": 0.0001, "loss": 1.5511, "loss/crossentropy": 2.712405562400818, "loss/fcd": 1.3828125, "loss/idx": 5.5, "loss/logits": 0.1682407334446907, "step": 887 }, { "epoch": 0.013259767506103524, "grad_norm": 0.392578125, "grad_norm_var": 0.07341370582580567, "learning_rate": 0.0001, "loss": 1.5617, "loss/crossentropy": 2.6705490350723267, "loss/fcd": 1.3828125, "loss/idx": 5.5, "loss/logits": 0.17886866629123688, "step": 888 }, { "epoch": 0.013274699676718507, "grad_norm": 0.474609375, "grad_norm_var": 0.06624393463134766, "learning_rate": 0.0001, "loss": 1.7751, "loss/crossentropy": 2.8483877182006836, "loss/fcd": 1.5546875, "loss/idx": 5.5, "loss/logits": 0.22039098292589188, "step": 889 }, { "epoch": 0.013289631847333488, "grad_norm": 0.7421875, "grad_norm_var": 0.07101413408915201, "learning_rate": 0.0001, "loss": 1.7931, "loss/crossentropy": 2.6503173112869263, "loss/fcd": 1.578125, "loss/idx": 5.5, "loss/logits": 0.21500100940465927, "step": 890 }, { "epoch": 0.013304564017948469, "grad_norm": 0.306640625, "grad_norm_var": 0.012502543131510417, "learning_rate": 0.0001, "loss": 1.5623, "loss/crossentropy": 2.57335889339447, "loss/fcd": 1.3828125, "loss/idx": 5.5, "loss/logits": 0.1794382557272911, "step": 891 }, { "epoch": 0.01331949618856345, "grad_norm": 0.388671875, "grad_norm_var": 0.012488667170206707, "learning_rate": 0.0001, "loss": 1.5658, "loss/crossentropy": 2.6493390798568726, "loss/fcd": 1.390625, "loss/idx": 5.5, "loss/logits": 0.17513766884803772, "step": 892 }, { "epoch": 0.013334428359178432, "grad_norm": 0.365234375, "grad_norm_var": 0.012361510594685873, "learning_rate": 0.0001, "loss": 1.6516, "loss/crossentropy": 2.388102173805237, "loss/fcd": 1.46484375, "loss/idx": 5.5, "loss/logits": 0.18672683835029602, "step": 893 }, { "epoch": 0.013349360529793413, "grad_norm": 0.392578125, "grad_norm_var": 0.011893065770467122, "learning_rate": 0.0001, "loss": 1.5088, "loss/crossentropy": 2.585448145866394, "loss/fcd": 1.33984375, "loss/idx": 5.5, "loss/logits": 0.16900470852851868, "step": 894 }, { "epoch": 0.013364292700408394, "grad_norm": 0.333984375, "grad_norm_var": 0.011595662434895833, "learning_rate": 0.0001, "loss": 1.6176, "loss/crossentropy": 2.563124418258667, "loss/fcd": 1.41796875, "loss/idx": 5.5, "loss/logits": 0.19963373243808746, "step": 895 }, { "epoch": 0.013379224871023377, "grad_norm": 0.46875, "grad_norm_var": 0.011696100234985352, "learning_rate": 0.0001, "loss": 1.7444, "loss/crossentropy": 2.6045339107513428, "loss/fcd": 1.53515625, "loss/idx": 5.5, "loss/logits": 0.20921117812395096, "step": 896 }, { "epoch": 0.013394157041638358, "grad_norm": 0.400390625, "grad_norm_var": 0.01160882314046224, "learning_rate": 0.0001, "loss": 1.5898, "loss/crossentropy": 2.5683369636535645, "loss/fcd": 1.4140625, "loss/idx": 5.5, "loss/logits": 0.17578723281621933, "step": 897 }, { "epoch": 0.013409089212253339, "grad_norm": 0.421875, "grad_norm_var": 0.010884841283162435, "learning_rate": 0.0001, "loss": 1.6952, "loss/crossentropy": 2.586871862411499, "loss/fcd": 1.46875, "loss/idx": 5.5, "loss/logits": 0.22648146003484726, "step": 898 }, { "epoch": 0.01342402138286832, "grad_norm": 0.37109375, "grad_norm_var": 0.01087487538655599, "learning_rate": 0.0001, "loss": 1.6624, "loss/crossentropy": 2.4116278886795044, "loss/fcd": 1.47265625, "loss/idx": 5.5, "loss/logits": 0.18976984173059464, "step": 899 }, { "epoch": 0.013438953553483303, "grad_norm": 0.37890625, "grad_norm_var": 0.010621579488118489, "learning_rate": 0.0001, "loss": 1.7932, "loss/crossentropy": 2.4072563648223877, "loss/fcd": 1.5703125, "loss/idx": 5.5, "loss/logits": 0.22293731570243835, "step": 900 }, { "epoch": 0.013453885724098284, "grad_norm": 0.396484375, "grad_norm_var": 0.010028521219889322, "learning_rate": 0.0001, "loss": 1.6985, "loss/crossentropy": 2.457371711730957, "loss/fcd": 1.49609375, "loss/idx": 5.5, "loss/logits": 0.202431321144104, "step": 901 }, { "epoch": 0.013468817894713265, "grad_norm": 0.384765625, "grad_norm_var": 0.010007969538370768, "learning_rate": 0.0001, "loss": 1.5288, "loss/crossentropy": 2.6801459789276123, "loss/fcd": 1.36328125, "loss/idx": 5.5, "loss/logits": 0.16556568443775177, "step": 902 }, { "epoch": 0.013483750065328247, "grad_norm": 0.345703125, "grad_norm_var": 0.009677871068318685, "learning_rate": 0.0001, "loss": 1.481, "loss/crossentropy": 2.8748459815979004, "loss/fcd": 1.3203125, "loss/idx": 5.5, "loss/logits": 0.1606980860233307, "step": 903 }, { "epoch": 0.013498682235943228, "grad_norm": 0.34765625, "grad_norm_var": 0.009910011291503906, "learning_rate": 0.0001, "loss": 1.5032, "loss/crossentropy": 2.6527421474456787, "loss/fcd": 1.33984375, "loss/idx": 5.5, "loss/logits": 0.1633721962571144, "step": 904 }, { "epoch": 0.01351361440655821, "grad_norm": 0.33984375, "grad_norm_var": 0.009838724136352539, "learning_rate": 0.0001, "loss": 1.5438, "loss/crossentropy": 2.3300899267196655, "loss/fcd": 1.37890625, "loss/idx": 5.5, "loss/logits": 0.16493894159793854, "step": 905 }, { "epoch": 0.01352854657717319, "grad_norm": 0.328125, "grad_norm_var": 0.0016100406646728516, "learning_rate": 0.0001, "loss": 1.4594, "loss/crossentropy": 2.485817790031433, "loss/fcd": 1.30078125, "loss/idx": 5.5, "loss/logits": 0.15857402980327606, "step": 906 }, { "epoch": 0.013543478747788173, "grad_norm": 0.412109375, "grad_norm_var": 0.0013697147369384766, "learning_rate": 0.0001, "loss": 1.6415, "loss/crossentropy": 2.596954822540283, "loss/fcd": 1.44921875, "loss/idx": 5.5, "loss/logits": 0.19229594618082047, "step": 907 }, { "epoch": 0.013558410918403154, "grad_norm": 0.380859375, "grad_norm_var": 0.0013642470041910807, "learning_rate": 0.0001, "loss": 1.5437, "loss/crossentropy": 2.5879993438720703, "loss/fcd": 1.36328125, "loss/idx": 5.5, "loss/logits": 0.18044909089803696, "step": 908 }, { "epoch": 0.013573343089018135, "grad_norm": 0.359375, "grad_norm_var": 0.0013773600260416667, "learning_rate": 0.0001, "loss": 1.64, "loss/crossentropy": 2.525175929069519, "loss/fcd": 1.43359375, "loss/idx": 5.5, "loss/logits": 0.20643934607505798, "step": 909 }, { "epoch": 0.013588275259633116, "grad_norm": 0.34375, "grad_norm_var": 0.00143736203511556, "learning_rate": 0.0001, "loss": 1.5451, "loss/crossentropy": 2.763263463973999, "loss/fcd": 1.3828125, "loss/idx": 5.5, "loss/logits": 0.16232506185770035, "step": 910 }, { "epoch": 0.013603207430248098, "grad_norm": 0.39453125, "grad_norm_var": 0.0013284683227539062, "learning_rate": 0.0001, "loss": 1.5742, "loss/crossentropy": 2.6070960760116577, "loss/fcd": 1.40625, "loss/idx": 5.5, "loss/logits": 0.16791047900915146, "step": 911 }, { "epoch": 0.01361813960086308, "grad_norm": 0.380859375, "grad_norm_var": 0.0007669925689697266, "learning_rate": 0.0001, "loss": 1.6868, "loss/crossentropy": 2.4819244146347046, "loss/fcd": 1.49609375, "loss/idx": 5.5, "loss/logits": 0.1907288283109665, "step": 912 }, { "epoch": 0.01363307177147806, "grad_norm": 0.396484375, "grad_norm_var": 0.0007542769114176432, "learning_rate": 0.0001, "loss": 1.5673, "loss/crossentropy": 2.4561848640441895, "loss/fcd": 1.3984375, "loss/idx": 5.5, "loss/logits": 0.1689068078994751, "step": 913 }, { "epoch": 0.013648003942093043, "grad_norm": 0.50390625, "grad_norm_var": 0.0016995588938395181, "learning_rate": 0.0001, "loss": 1.7706, "loss/crossentropy": 2.5172276496887207, "loss/fcd": 1.52734375, "loss/idx": 5.5, "loss/logits": 0.24328875541687012, "step": 914 }, { "epoch": 0.013662936112708024, "grad_norm": 0.3046875, "grad_norm_var": 0.002045424779256185, "learning_rate": 0.0001, "loss": 1.5093, "loss/crossentropy": 2.4740447998046875, "loss/fcd": 1.3359375, "loss/idx": 5.5, "loss/logits": 0.1733308956027031, "step": 915 }, { "epoch": 0.013677868283323005, "grad_norm": 0.443359375, "grad_norm_var": 0.0023396809895833335, "learning_rate": 0.0001, "loss": 1.8087, "loss/crossentropy": 2.6645225286483765, "loss/fcd": 1.56640625, "loss/idx": 5.5, "loss/logits": 0.24224933236837387, "step": 916 }, { "epoch": 0.013692800453937986, "grad_norm": 0.453125, "grad_norm_var": 0.0026729424794514974, "learning_rate": 0.0001, "loss": 1.7391, "loss/crossentropy": 2.4171453714370728, "loss/fcd": 1.53515625, "loss/idx": 5.5, "loss/logits": 0.20390180498361588, "step": 917 }, { "epoch": 0.013707732624552969, "grad_norm": 0.37890625, "grad_norm_var": 0.0026732762654622395, "learning_rate": 0.0001, "loss": 1.5484, "loss/crossentropy": 2.8347936868667603, "loss/fcd": 1.375, "loss/idx": 5.5, "loss/logits": 0.17344320565462112, "step": 918 }, { "epoch": 0.01372266479516795, "grad_norm": 0.3984375, "grad_norm_var": 0.0025913079579671225, "learning_rate": 0.0001, "loss": 1.5296, "loss/crossentropy": 2.494508743286133, "loss/fcd": 1.34765625, "loss/idx": 5.5, "loss/logits": 0.1818997785449028, "step": 919 }, { "epoch": 0.01373759696578293, "grad_norm": 0.39453125, "grad_norm_var": 0.002492888768513997, "learning_rate": 0.0001, "loss": 1.6563, "loss/crossentropy": 2.2331273555755615, "loss/fcd": 1.46875, "loss/idx": 5.5, "loss/logits": 0.1875598356127739, "step": 920 }, { "epoch": 0.013752529136397913, "grad_norm": 0.333984375, "grad_norm_var": 0.0025328954060872396, "learning_rate": 0.0001, "loss": 1.5313, "loss/crossentropy": 2.591843605041504, "loss/fcd": 1.36328125, "loss/idx": 5.5, "loss/logits": 0.16802766174077988, "step": 921 }, { "epoch": 0.013767461307012894, "grad_norm": 0.33203125, "grad_norm_var": 0.002502695719401042, "learning_rate": 0.0001, "loss": 1.6131, "loss/crossentropy": 2.5554168224334717, "loss/fcd": 1.41796875, "loss/idx": 5.5, "loss/logits": 0.19509856402873993, "step": 922 }, { "epoch": 0.013782393477627875, "grad_norm": 0.3046875, "grad_norm_var": 0.0028812249501546225, "learning_rate": 0.0001, "loss": 1.4442, "loss/crossentropy": 2.4917492866516113, "loss/fcd": 1.2890625, "loss/idx": 5.5, "loss/logits": 0.1551196053624153, "step": 923 }, { "epoch": 0.013797325648242856, "grad_norm": 0.345703125, "grad_norm_var": 0.0029613335927327475, "learning_rate": 0.0001, "loss": 1.4584, "loss/crossentropy": 2.6141878366470337, "loss/fcd": 1.30078125, "loss/idx": 5.5, "loss/logits": 0.15758418291807175, "step": 924 }, { "epoch": 0.013812257818857839, "grad_norm": 0.478515625, "grad_norm_var": 0.00353240966796875, "learning_rate": 0.0001, "loss": 1.771, "loss/crossentropy": 2.5590325593948364, "loss/fcd": 1.5625, "loss/idx": 5.5, "loss/logits": 0.20846740901470184, "step": 925 }, { "epoch": 0.01382718998947282, "grad_norm": 0.341796875, "grad_norm_var": 0.0035438378651936847, "learning_rate": 0.0001, "loss": 1.5279, "loss/crossentropy": 2.688356399536133, "loss/fcd": 1.359375, "loss/idx": 5.5, "loss/logits": 0.16853488981723785, "step": 926 }, { "epoch": 0.0138421221600878, "grad_norm": 0.36328125, "grad_norm_var": 0.003571812311808268, "learning_rate": 0.0001, "loss": 1.5302, "loss/crossentropy": 2.627697229385376, "loss/fcd": 1.359375, "loss/idx": 5.5, "loss/logits": 0.1708643138408661, "step": 927 }, { "epoch": 0.013857054330702783, "grad_norm": 0.484375, "grad_norm_var": 0.004189300537109375, "learning_rate": 0.0001, "loss": 1.7863, "loss/crossentropy": 2.4670333862304688, "loss/fcd": 1.5859375, "loss/idx": 5.5, "loss/logits": 0.2003796547651291, "step": 928 }, { "epoch": 0.013871986501317764, "grad_norm": 1.3671875, "grad_norm_var": 0.06377600034077963, "learning_rate": 0.0001, "loss": 1.8487, "loss/crossentropy": 2.5073784589767456, "loss/fcd": 1.6484375, "loss/idx": 6.0, "loss/logits": 0.20029612630605698, "step": 929 }, { "epoch": 0.013886918671932745, "grad_norm": 2.390625, "grad_norm_var": 0.2993701775868734, "learning_rate": 0.0001, "loss": 1.9457, "loss/crossentropy": 2.699519634246826, "loss/fcd": 1.73046875, "loss/idx": 6.0, "loss/logits": 0.2152162715792656, "step": 930 }, { "epoch": 0.013901850842547726, "grad_norm": 1.671875, "grad_norm_var": 0.3678853193918864, "learning_rate": 0.0001, "loss": 1.9107, "loss/crossentropy": 2.436430335044861, "loss/fcd": 1.71484375, "loss/idx": 6.0, "loss/logits": 0.19587621092796326, "step": 931 }, { "epoch": 0.013916783013162709, "grad_norm": 1.1875, "grad_norm_var": 0.3814806620279948, "learning_rate": 0.0001, "loss": 1.8237, "loss/crossentropy": 2.57407009601593, "loss/fcd": 1.625, "loss/idx": 6.0, "loss/logits": 0.1987301930785179, "step": 932 }, { "epoch": 0.01393171518377769, "grad_norm": 1.2109375, "grad_norm_var": 0.3922607421875, "learning_rate": 0.0001, "loss": 2.1566, "loss/crossentropy": 2.553762674331665, "loss/fcd": 1.88671875, "loss/idx": 6.0, "loss/logits": 0.26991352438926697, "step": 933 }, { "epoch": 0.013946647354392671, "grad_norm": 0.9375, "grad_norm_var": 0.384196408589681, "learning_rate": 0.0001, "loss": 2.0457, "loss/crossentropy": 2.5711495876312256, "loss/fcd": 1.81640625, "loss/idx": 6.0, "loss/logits": 0.22925003618001938, "step": 934 }, { "epoch": 0.013961579525007652, "grad_norm": 0.76171875, "grad_norm_var": 0.3737721761067708, "learning_rate": 0.0001, "loss": 1.7447, "loss/crossentropy": 2.5488909482955933, "loss/fcd": 1.55078125, "loss/idx": 6.0, "loss/logits": 0.19389048963785172, "step": 935 }, { "epoch": 0.013976511695622635, "grad_norm": 0.8671875, "grad_norm_var": 0.3617634455362956, "learning_rate": 0.0001, "loss": 1.9592, "loss/crossentropy": 2.635095238685608, "loss/fcd": 1.74609375, "loss/idx": 6.0, "loss/logits": 0.21313950419425964, "step": 936 }, { "epoch": 0.013991443866237616, "grad_norm": 0.7109375, "grad_norm_var": 0.3454036553700765, "learning_rate": 0.0001, "loss": 1.7995, "loss/crossentropy": 2.693148612976074, "loss/fcd": 1.6015625, "loss/idx": 6.0, "loss/logits": 0.19793272763490677, "step": 937 }, { "epoch": 0.014006376036852597, "grad_norm": 0.7421875, "grad_norm_var": 0.3270587762196859, "learning_rate": 0.0001, "loss": 1.8292, "loss/crossentropy": 2.7530752420425415, "loss/fcd": 1.625, "loss/idx": 6.0, "loss/logits": 0.20420175790786743, "step": 938 }, { "epoch": 0.01402130820746758, "grad_norm": 0.7109375, "grad_norm_var": 0.30591975847880043, "learning_rate": 0.0001, "loss": 1.9451, "loss/crossentropy": 2.7189706563949585, "loss/fcd": 1.703125, "loss/idx": 6.0, "loss/logits": 0.2419990748167038, "step": 939 }, { "epoch": 0.01403624037808256, "grad_norm": 0.5859375, "grad_norm_var": 0.2914271036783854, "learning_rate": 0.0001, "loss": 1.9409, "loss/crossentropy": 2.3447235822677612, "loss/fcd": 1.71875, "loss/idx": 6.0, "loss/logits": 0.22218556702136993, "step": 940 }, { "epoch": 0.014051172548697541, "grad_norm": 0.65625, "grad_norm_var": 0.28280218442281085, "learning_rate": 0.0001, "loss": 2.0273, "loss/crossentropy": 2.694061040878296, "loss/fcd": 1.7890625, "loss/idx": 6.0, "loss/logits": 0.23818911612033844, "step": 941 }, { "epoch": 0.014066104719312522, "grad_norm": 0.6484375, "grad_norm_var": 0.2643483479817708, "learning_rate": 0.0001, "loss": 1.82, "loss/crossentropy": 2.6287845373153687, "loss/fcd": 1.60546875, "loss/idx": 6.0, "loss/logits": 0.21457893401384354, "step": 942 }, { "epoch": 0.014081036889927505, "grad_norm": 0.95703125, "grad_norm_var": 0.23945414225260417, "learning_rate": 0.0001, "loss": 2.2185, "loss/crossentropy": 2.373252034187317, "loss/fcd": 1.92578125, "loss/idx": 6.0, "loss/logits": 0.2926865443587303, "step": 943 }, { "epoch": 0.014095969060542486, "grad_norm": 0.66015625, "grad_norm_var": 0.22946058909098307, "learning_rate": 0.0001, "loss": 2.0388, "loss/crossentropy": 2.5666359663009644, "loss/fcd": 1.7890625, "loss/idx": 6.0, "loss/logits": 0.24973313510417938, "step": 944 }, { "epoch": 0.014110901231157467, "grad_norm": 0.53515625, "grad_norm_var": 0.23245340983072918, "learning_rate": 0.0001, "loss": 1.6877, "loss/crossentropy": 2.591952323913574, "loss/fcd": 1.51171875, "loss/idx": 6.0, "loss/logits": 0.17602194100618362, "step": 945 }, { "epoch": 0.01412583340177245, "grad_norm": 0.6796875, "grad_norm_var": 0.08725763956705729, "learning_rate": 0.0001, "loss": 2.0353, "loss/crossentropy": 2.3668471574783325, "loss/fcd": 1.7890625, "loss/idx": 6.0, "loss/logits": 0.24623292684555054, "step": 946 }, { "epoch": 0.01414076557238743, "grad_norm": 0.4921875, "grad_norm_var": 0.044209798177083336, "learning_rate": 0.0001, "loss": 1.6808, "loss/crossentropy": 2.7240748405456543, "loss/fcd": 1.5078125, "loss/idx": 6.0, "loss/logits": 0.17297939211130142, "step": 947 }, { "epoch": 0.014155697743002411, "grad_norm": 0.56640625, "grad_norm_var": 0.03386834462483724, "learning_rate": 0.0001, "loss": 1.7939, "loss/crossentropy": 2.401008129119873, "loss/fcd": 1.60546875, "loss/idx": 6.0, "loss/logits": 0.18840720504522324, "step": 948 }, { "epoch": 0.014170629913617392, "grad_norm": 0.57421875, "grad_norm_var": 0.01860326131184896, "learning_rate": 0.0001, "loss": 1.9822, "loss/crossentropy": 2.2320332527160645, "loss/fcd": 1.7734375, "loss/idx": 6.0, "loss/logits": 0.20871728658676147, "step": 949 }, { "epoch": 0.014185562084232375, "grad_norm": 0.6171875, "grad_norm_var": 0.014568074544270834, "learning_rate": 0.0001, "loss": 1.9696, "loss/crossentropy": 2.5282981395721436, "loss/fcd": 1.73046875, "loss/idx": 6.0, "loss/logits": 0.23909948021173477, "step": 950 }, { "epoch": 0.014200494254847356, "grad_norm": 0.72265625, "grad_norm_var": 0.014200592041015625, "learning_rate": 0.0001, "loss": 2.0531, "loss/crossentropy": 2.437385082244873, "loss/fcd": 1.8203125, "loss/idx": 6.0, "loss/logits": 0.23282259702682495, "step": 951 }, { "epoch": 0.014215426425462337, "grad_norm": 0.61328125, "grad_norm_var": 0.011568133036295574, "learning_rate": 0.0001, "loss": 1.8507, "loss/crossentropy": 2.450563669204712, "loss/fcd": 1.6484375, "loss/idx": 6.0, "loss/logits": 0.20221839100122452, "step": 952 }, { "epoch": 0.014230358596077318, "grad_norm": 0.474609375, "grad_norm_var": 0.013281742731730143, "learning_rate": 0.0001, "loss": 1.6715, "loss/crossentropy": 2.5514947175979614, "loss/fcd": 1.4921875, "loss/idx": 6.0, "loss/logits": 0.17935138195753098, "step": 953 }, { "epoch": 0.0142452907666923, "grad_norm": 0.58984375, "grad_norm_var": 0.012651936213175455, "learning_rate": 0.0001, "loss": 1.9562, "loss/crossentropy": 2.6119589805603027, "loss/fcd": 1.72265625, "loss/idx": 6.0, "loss/logits": 0.23356223851442337, "step": 954 }, { "epoch": 0.014260222937307282, "grad_norm": 0.609375, "grad_norm_var": 0.01220396359761556, "learning_rate": 0.0001, "loss": 1.724, "loss/crossentropy": 2.741728901863098, "loss/fcd": 1.5390625, "loss/idx": 6.0, "loss/logits": 0.1849788874387741, "step": 955 }, { "epoch": 0.014275155107922263, "grad_norm": 0.53515625, "grad_norm_var": 0.012622181574503582, "learning_rate": 0.0001, "loss": 1.7135, "loss/crossentropy": 2.7166521549224854, "loss/fcd": 1.53515625, "loss/idx": 6.0, "loss/logits": 0.17837309837341309, "step": 956 }, { "epoch": 0.014290087278537245, "grad_norm": 0.55078125, "grad_norm_var": 0.012817875544230143, "learning_rate": 0.0001, "loss": 1.7684, "loss/crossentropy": 2.659182906150818, "loss/fcd": 1.57421875, "loss/idx": 6.0, "loss/logits": 0.1941850259900093, "step": 957 }, { "epoch": 0.014305019449152226, "grad_norm": 0.65625, "grad_norm_var": 0.012857421239217123, "learning_rate": 0.0001, "loss": 1.9042, "loss/crossentropy": 2.512916922569275, "loss/fcd": 1.6953125, "loss/idx": 6.0, "loss/logits": 0.20891183614730835, "step": 958 }, { "epoch": 0.014319951619767207, "grad_norm": 0.490234375, "grad_norm_var": 0.0051648457845052086, "learning_rate": 0.0001, "loss": 1.6953, "loss/crossentropy": 2.5976825952529907, "loss/fcd": 1.515625, "loss/idx": 6.0, "loss/logits": 0.1796964481472969, "step": 959 }, { "epoch": 0.014334883790382188, "grad_norm": 0.7109375, "grad_norm_var": 0.0058318456013997395, "learning_rate": 0.0001, "loss": 2.0133, "loss/crossentropy": 2.5360227823257446, "loss/fcd": 1.7734375, "loss/idx": 6.0, "loss/logits": 0.23983266949653625, "step": 960 }, { "epoch": 0.014349815960997171, "grad_norm": 0.478515625, "grad_norm_var": 0.00643614133199056, "learning_rate": 0.0001, "loss": 1.606, "loss/crossentropy": 2.5436811447143555, "loss/fcd": 1.44140625, "loss/idx": 6.0, "loss/logits": 0.16457198560237885, "step": 961 }, { "epoch": 0.014364748131612152, "grad_norm": 0.466796875, "grad_norm_var": 0.006583404541015625, "learning_rate": 0.0001, "loss": 1.6402, "loss/crossentropy": 2.638171434402466, "loss/fcd": 1.46484375, "loss/idx": 6.0, "loss/logits": 0.17533918470144272, "step": 962 }, { "epoch": 0.014379680302227133, "grad_norm": 0.447265625, "grad_norm_var": 0.007186237970987956, "learning_rate": 0.0001, "loss": 1.6373, "loss/crossentropy": 2.565574049949646, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.17632722109556198, "step": 963 }, { "epoch": 0.014394612472842115, "grad_norm": 0.46484375, "grad_norm_var": 0.007865635553995769, "learning_rate": 0.0001, "loss": 1.682, "loss/crossentropy": 2.6288174390792847, "loss/fcd": 1.5078125, "loss/idx": 6.0, "loss/logits": 0.17416883260011673, "step": 964 }, { "epoch": 0.014409544643457096, "grad_norm": 0.55859375, "grad_norm_var": 0.00785673459370931, "learning_rate": 0.0001, "loss": 1.9074, "loss/crossentropy": 2.8099989891052246, "loss/fcd": 1.6875, "loss/idx": 6.0, "loss/logits": 0.21986465901136398, "step": 965 }, { "epoch": 0.014424476814072077, "grad_norm": 0.5234375, "grad_norm_var": 0.00771177609761556, "learning_rate": 0.0001, "loss": 1.6413, "loss/crossentropy": 2.626309394836426, "loss/fcd": 1.46875, "loss/idx": 6.0, "loss/logits": 0.1725175604224205, "step": 966 }, { "epoch": 0.014439408984687058, "grad_norm": 0.5703125, "grad_norm_var": 0.0057727654774983725, "learning_rate": 0.0001, "loss": 1.8253, "loss/crossentropy": 2.5631070137023926, "loss/fcd": 1.62109375, "loss/idx": 6.0, "loss/logits": 0.20422351360321045, "step": 967 }, { "epoch": 0.014454341155302041, "grad_norm": 0.466796875, "grad_norm_var": 0.005804951985677083, "learning_rate": 0.0001, "loss": 1.6936, "loss/crossentropy": 2.622004508972168, "loss/fcd": 1.50390625, "loss/idx": 6.0, "loss/logits": 0.18973329663276672, "step": 968 }, { "epoch": 0.014469273325917022, "grad_norm": 0.482421875, "grad_norm_var": 0.005743662516276042, "learning_rate": 0.0001, "loss": 1.5916, "loss/crossentropy": 2.339906692504883, "loss/fcd": 1.4375, "loss/idx": 6.0, "loss/logits": 0.15414663404226303, "step": 969 }, { "epoch": 0.014484205496532003, "grad_norm": 0.484375, "grad_norm_var": 0.005704180399576823, "learning_rate": 0.0001, "loss": 1.7258, "loss/crossentropy": 2.5720293521881104, "loss/fcd": 1.53125, "loss/idx": 6.0, "loss/logits": 0.19454781711101532, "step": 970 }, { "epoch": 0.014499137667146984, "grad_norm": 0.45703125, "grad_norm_var": 0.005562845865885417, "learning_rate": 0.0001, "loss": 1.6937, "loss/crossentropy": 2.547956943511963, "loss/fcd": 1.51953125, "loss/idx": 6.0, "loss/logits": 0.17421600222587585, "step": 971 }, { "epoch": 0.014514069837761967, "grad_norm": 0.486328125, "grad_norm_var": 0.00562284787495931, "learning_rate": 0.0001, "loss": 1.7068, "loss/crossentropy": 2.445081949234009, "loss/fcd": 1.5234375, "loss/idx": 6.0, "loss/logits": 0.18333792686462402, "step": 972 }, { "epoch": 0.014529002008376948, "grad_norm": 0.59765625, "grad_norm_var": 0.005962355931599935, "learning_rate": 0.0001, "loss": 2.2223, "loss/crossentropy": 2.6638505458831787, "loss/fcd": 1.91015625, "loss/idx": 6.0, "loss/logits": 0.3120998740196228, "step": 973 }, { "epoch": 0.014543934178991929, "grad_norm": 0.486328125, "grad_norm_var": 0.0047108968098958336, "learning_rate": 0.0001, "loss": 1.7339, "loss/crossentropy": 2.534387230873108, "loss/fcd": 1.55859375, "loss/idx": 6.0, "loss/logits": 0.17531096935272217, "step": 974 }, { "epoch": 0.014558866349606911, "grad_norm": 0.5546875, "grad_norm_var": 0.00479429562886556, "learning_rate": 0.0001, "loss": 1.7092, "loss/crossentropy": 2.6609139442443848, "loss/fcd": 1.5234375, "loss/idx": 6.0, "loss/logits": 0.18571606278419495, "step": 975 }, { "epoch": 0.014573798520221892, "grad_norm": 0.48828125, "grad_norm_var": 0.00206907590230306, "learning_rate": 0.0001, "loss": 1.7175, "loss/crossentropy": 2.5186537504196167, "loss/fcd": 1.53515625, "loss/idx": 6.0, "loss/logits": 0.1823228821158409, "step": 976 }, { "epoch": 0.014588730690836873, "grad_norm": 0.53125, "grad_norm_var": 0.0020858128865559895, "learning_rate": 0.0001, "loss": 1.913, "loss/crossentropy": 2.3601412773132324, "loss/fcd": 1.6953125, "loss/idx": 6.0, "loss/logits": 0.21770965307950974, "step": 977 }, { "epoch": 0.014603662861451854, "grad_norm": 0.46484375, "grad_norm_var": 0.002095778783162435, "learning_rate": 0.0001, "loss": 1.8019, "loss/crossentropy": 2.6870713233947754, "loss/fcd": 1.58203125, "loss/idx": 6.0, "loss/logits": 0.2198324054479599, "step": 978 }, { "epoch": 0.014618595032066837, "grad_norm": 0.5, "grad_norm_var": 0.0018704732259114583, "learning_rate": 0.0001, "loss": 1.8232, "loss/crossentropy": 2.392784595489502, "loss/fcd": 1.625, "loss/idx": 6.0, "loss/logits": 0.1981583684682846, "step": 979 }, { "epoch": 0.014633527202681818, "grad_norm": 0.4921875, "grad_norm_var": 0.0017623265584309896, "learning_rate": 0.0001, "loss": 1.8108, "loss/crossentropy": 2.4014596939086914, "loss/fcd": 1.609375, "loss/idx": 6.0, "loss/logits": 0.20145010948181152, "step": 980 }, { "epoch": 0.014648459373296799, "grad_norm": 0.6328125, "grad_norm_var": 0.0025970458984375, "learning_rate": 0.0001, "loss": 1.8404, "loss/crossentropy": 2.622117042541504, "loss/fcd": 1.640625, "loss/idx": 6.0, "loss/logits": 0.19979581236839294, "step": 981 }, { "epoch": 0.014663391543911782, "grad_norm": 0.515625, "grad_norm_var": 0.0025906880696614583, "learning_rate": 0.0001, "loss": 1.9272, "loss/crossentropy": 2.487557888031006, "loss/fcd": 1.68359375, "loss/idx": 6.0, "loss/logits": 0.2436397820711136, "step": 982 }, { "epoch": 0.014678323714526762, "grad_norm": 0.484375, "grad_norm_var": 0.0023976643880208332, "learning_rate": 0.0001, "loss": 1.6701, "loss/crossentropy": 2.512000560760498, "loss/fcd": 1.48828125, "loss/idx": 6.0, "loss/logits": 0.18184158951044083, "step": 983 }, { "epoch": 0.014693255885141743, "grad_norm": 0.490234375, "grad_norm_var": 0.002303822835286458, "learning_rate": 0.0001, "loss": 1.5974, "loss/crossentropy": 2.630392551422119, "loss/fcd": 1.43359375, "loss/idx": 6.0, "loss/logits": 0.16384021937847137, "step": 984 }, { "epoch": 0.014708188055756724, "grad_norm": 0.4609375, "grad_norm_var": 0.0024096012115478516, "learning_rate": 0.0001, "loss": 1.6285, "loss/crossentropy": 2.629890561103821, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.16755110025405884, "step": 985 }, { "epoch": 0.014723120226371707, "grad_norm": 0.53125, "grad_norm_var": 0.0023996829986572266, "learning_rate": 0.0001, "loss": 1.9361, "loss/crossentropy": 2.5041611194610596, "loss/fcd": 1.703125, "loss/idx": 6.0, "loss/logits": 0.23302249610424042, "step": 986 }, { "epoch": 0.014738052396986688, "grad_norm": 0.52734375, "grad_norm_var": 0.0022039890289306642, "learning_rate": 0.0001, "loss": 1.5733, "loss/crossentropy": 2.6311824321746826, "loss/fcd": 1.42578125, "loss/idx": 6.0, "loss/logits": 0.14749648422002792, "step": 987 }, { "epoch": 0.014752984567601669, "grad_norm": 0.498046875, "grad_norm_var": 0.002167367935180664, "learning_rate": 0.0001, "loss": 1.6699, "loss/crossentropy": 2.3241487741470337, "loss/fcd": 1.5, "loss/idx": 6.0, "loss/logits": 0.1698528677225113, "step": 988 }, { "epoch": 0.014767916738216652, "grad_norm": 0.5234375, "grad_norm_var": 0.001703500747680664, "learning_rate": 0.0001, "loss": 1.7241, "loss/crossentropy": 2.490164041519165, "loss/fcd": 1.54296875, "loss/idx": 6.0, "loss/logits": 0.18110015988349915, "step": 989 }, { "epoch": 0.014782848908831633, "grad_norm": 0.44140625, "grad_norm_var": 0.0019795099894205728, "learning_rate": 0.0001, "loss": 1.6575, "loss/crossentropy": 2.446812629699707, "loss/fcd": 1.4765625, "loss/idx": 6.0, "loss/logits": 0.18092559278011322, "step": 990 }, { "epoch": 0.014797781079446614, "grad_norm": 0.52734375, "grad_norm_var": 0.0018580118815104167, "learning_rate": 0.0001, "loss": 1.6609, "loss/crossentropy": 2.4171838760375977, "loss/fcd": 1.5, "loss/idx": 6.0, "loss/logits": 0.1608925387263298, "step": 991 }, { "epoch": 0.014812713250061595, "grad_norm": 0.4921875, "grad_norm_var": 0.0018493016560872397, "learning_rate": 0.0001, "loss": 1.6547, "loss/crossentropy": 2.6258944272994995, "loss/fcd": 1.46875, "loss/idx": 6.0, "loss/logits": 0.1859017238020897, "step": 992 }, { "epoch": 0.014827645420676577, "grad_norm": 0.5078125, "grad_norm_var": 0.0018081029256184896, "learning_rate": 0.0001, "loss": 1.6637, "loss/crossentropy": 2.641685724258423, "loss/fcd": 1.484375, "loss/idx": 6.0, "loss/logits": 0.17929885536432266, "step": 993 }, { "epoch": 0.014842577591291558, "grad_norm": 0.640625, "grad_norm_var": 0.0027837117513020834, "learning_rate": 0.0001, "loss": 1.7782, "loss/crossentropy": 2.4380099773406982, "loss/fcd": 1.59375, "loss/idx": 6.0, "loss/logits": 0.1844642013311386, "step": 994 }, { "epoch": 0.01485750976190654, "grad_norm": 0.53515625, "grad_norm_var": 0.0027831395467122397, "learning_rate": 0.0001, "loss": 1.8351, "loss/crossentropy": 2.3071682453155518, "loss/fcd": 1.62109375, "loss/idx": 6.0, "loss/logits": 0.2140112966299057, "step": 995 }, { "epoch": 0.01487244193252152, "grad_norm": 0.48046875, "grad_norm_var": 0.0028333028157552084, "learning_rate": 0.0001, "loss": 1.6957, "loss/crossentropy": 2.662778615951538, "loss/fcd": 1.50390625, "loss/idx": 6.0, "loss/logits": 0.19180986285209656, "step": 996 }, { "epoch": 0.014887374103136503, "grad_norm": 0.4375, "grad_norm_var": 0.00222930908203125, "learning_rate": 0.0001, "loss": 1.7745, "loss/crossentropy": 2.654296398162842, "loss/fcd": 1.5625, "loss/idx": 6.0, "loss/logits": 0.2119828313589096, "step": 997 }, { "epoch": 0.014902306273751484, "grad_norm": 0.5, "grad_norm_var": 0.002224222819010417, "learning_rate": 0.0001, "loss": 1.6723, "loss/crossentropy": 2.4404503107070923, "loss/fcd": 1.49609375, "loss/idx": 6.0, "loss/logits": 0.17621850222349167, "step": 998 }, { "epoch": 0.014917238444366465, "grad_norm": 0.53125, "grad_norm_var": 0.0022333780924479168, "learning_rate": 0.0001, "loss": 1.5216, "loss/crossentropy": 2.5146583318710327, "loss/fcd": 1.37890625, "loss/idx": 6.0, "loss/logits": 0.14269014447927475, "step": 999 }, { "epoch": 0.014932170614981448, "grad_norm": 0.49609375, "grad_norm_var": 0.0022217909495035808, "learning_rate": 0.0001, "loss": 1.7702, "loss/crossentropy": 2.4616737365722656, "loss/fcd": 1.56640625, "loss/idx": 6.0, "loss/logits": 0.20378455519676208, "step": 1000 }, { "epoch": 0.014947102785596428, "grad_norm": 0.58984375, "grad_norm_var": 0.0024483839670817057, "learning_rate": 0.0001, "loss": 1.9032, "loss/crossentropy": 2.3614426851272583, "loss/fcd": 1.6953125, "loss/idx": 6.0, "loss/logits": 0.20791510492563248, "step": 1001 }, { "epoch": 0.01496203495621141, "grad_norm": 0.5234375, "grad_norm_var": 0.002436558405558268, "learning_rate": 0.0001, "loss": 1.5926, "loss/crossentropy": 2.6121147871017456, "loss/fcd": 1.42578125, "loss/idx": 6.0, "loss/logits": 0.16682759672403336, "step": 1002 }, { "epoch": 0.01497696712682639, "grad_norm": 0.50390625, "grad_norm_var": 0.0024346510569254556, "learning_rate": 0.0001, "loss": 1.5917, "loss/crossentropy": 2.5289264917373657, "loss/fcd": 1.4296875, "loss/idx": 6.0, "loss/logits": 0.16205105185508728, "step": 1003 }, { "epoch": 0.014991899297441373, "grad_norm": 0.4921875, "grad_norm_var": 0.002449480692545573, "learning_rate": 0.0001, "loss": 1.5878, "loss/crossentropy": 2.5034509897232056, "loss/fcd": 1.43359375, "loss/idx": 6.0, "loss/logits": 0.1541755199432373, "step": 1004 }, { "epoch": 0.015006831468056354, "grad_norm": 0.474609375, "grad_norm_var": 0.0025365034739176433, "learning_rate": 0.0001, "loss": 1.6604, "loss/crossentropy": 2.5246294736862183, "loss/fcd": 1.4765625, "loss/idx": 6.0, "loss/logits": 0.18387333303689957, "step": 1005 }, { "epoch": 0.015021763638671335, "grad_norm": 0.58984375, "grad_norm_var": 0.002538919448852539, "learning_rate": 0.0001, "loss": 1.9237, "loss/crossentropy": 2.468030333518982, "loss/fcd": 1.70703125, "loss/idx": 6.0, "loss/logits": 0.21670957654714584, "step": 1006 }, { "epoch": 0.015036695809286318, "grad_norm": 0.51171875, "grad_norm_var": 0.002539173762003581, "learning_rate": 0.0001, "loss": 1.8183, "loss/crossentropy": 2.726784110069275, "loss/fcd": 1.6015625, "loss/idx": 6.0, "loss/logits": 0.21673081070184708, "step": 1007 }, { "epoch": 0.015051627979901299, "grad_norm": 0.4921875, "grad_norm_var": 0.002539173762003581, "learning_rate": 0.0001, "loss": 1.7091, "loss/crossentropy": 2.7661021947860718, "loss/fcd": 1.5234375, "loss/idx": 6.0, "loss/logits": 0.18564757704734802, "step": 1008 }, { "epoch": 0.01506656015051628, "grad_norm": 0.5859375, "grad_norm_var": 0.002802387873331706, "learning_rate": 0.0001, "loss": 1.9547, "loss/crossentropy": 2.5440393686294556, "loss/fcd": 1.7265625, "loss/idx": 6.0, "loss/logits": 0.22810395061969757, "step": 1009 }, { "epoch": 0.01508149232113126, "grad_norm": 0.5234375, "grad_norm_var": 0.0018391768137613932, "learning_rate": 0.0001, "loss": 1.8354, "loss/crossentropy": 2.780647039413452, "loss/fcd": 1.609375, "loss/idx": 6.0, "loss/logits": 0.2260511964559555, "step": 1010 }, { "epoch": 0.015096424491746243, "grad_norm": 0.53515625, "grad_norm_var": 0.0018391768137613932, "learning_rate": 0.0001, "loss": 2.0157, "loss/crossentropy": 2.3090795278549194, "loss/fcd": 1.77734375, "loss/idx": 6.0, "loss/logits": 0.23833715170621872, "step": 1011 }, { "epoch": 0.015111356662361224, "grad_norm": 0.52734375, "grad_norm_var": 0.0017499128977457683, "learning_rate": 0.0001, "loss": 2.0326, "loss/crossentropy": 2.5182803869247437, "loss/fcd": 1.77734375, "loss/idx": 6.0, "loss/logits": 0.2552146390080452, "step": 1012 }, { "epoch": 0.015126288832976205, "grad_norm": 0.443359375, "grad_norm_var": 0.0016878763834635416, "learning_rate": 0.0001, "loss": 1.6628, "loss/crossentropy": 2.5835026502609253, "loss/fcd": 1.47265625, "loss/idx": 6.0, "loss/logits": 0.190179705619812, "step": 1013 }, { "epoch": 0.015141221003591186, "grad_norm": 0.439453125, "grad_norm_var": 0.0020786126454671225, "learning_rate": 0.0001, "loss": 1.6838, "loss/crossentropy": 2.706833243370056, "loss/fcd": 1.48828125, "loss/idx": 6.0, "loss/logits": 0.19552963227033615, "step": 1014 }, { "epoch": 0.015156153174206169, "grad_norm": 1.046875, "grad_norm_var": 0.019727691014607748, "learning_rate": 0.0001, "loss": 2.2195, "loss/crossentropy": 2.607192873954773, "loss/fcd": 1.8984375, "loss/idx": 6.0, "loss/logits": 0.3210318982601166, "step": 1015 }, { "epoch": 0.01517108534482115, "grad_norm": 0.55078125, "grad_norm_var": 0.01953275998433431, "learning_rate": 0.0001, "loss": 1.7991, "loss/crossentropy": 2.596633195877075, "loss/fcd": 1.5703125, "loss/idx": 6.0, "loss/logits": 0.22876836359500885, "step": 1016 }, { "epoch": 0.01518601751543613, "grad_norm": 0.45703125, "grad_norm_var": 0.01996293067932129, "learning_rate": 0.0001, "loss": 1.617, "loss/crossentropy": 2.566289782524109, "loss/fcd": 1.44140625, "loss/idx": 6.0, "loss/logits": 0.1755644902586937, "step": 1017 }, { "epoch": 0.015200949686051114, "grad_norm": 0.58203125, "grad_norm_var": 0.020020151138305665, "learning_rate": 0.0001, "loss": 1.6675, "loss/crossentropy": 2.5668020248413086, "loss/fcd": 1.4921875, "loss/idx": 6.0, "loss/logits": 0.17528380453586578, "step": 1018 }, { "epoch": 0.015215881856666095, "grad_norm": 0.6015625, "grad_norm_var": 0.020051940282185873, "learning_rate": 0.0001, "loss": 1.7784, "loss/crossentropy": 2.85185706615448, "loss/fcd": 1.5859375, "loss/idx": 6.0, "loss/logits": 0.19249311834573746, "step": 1019 }, { "epoch": 0.015230814027281075, "grad_norm": 0.515625, "grad_norm_var": 0.019895156224568684, "learning_rate": 0.0001, "loss": 1.7763, "loss/crossentropy": 2.5789307355880737, "loss/fcd": 1.56640625, "loss/idx": 6.0, "loss/logits": 0.20993077754974365, "step": 1020 }, { "epoch": 0.015245746197896056, "grad_norm": 0.42578125, "grad_norm_var": 0.020566304524739582, "learning_rate": 0.0001, "loss": 1.5714, "loss/crossentropy": 2.543707847595215, "loss/fcd": 1.41796875, "loss/idx": 6.0, "loss/logits": 0.15347032248973846, "step": 1021 }, { "epoch": 0.01526067836851104, "grad_norm": 0.4765625, "grad_norm_var": 0.02079308827718099, "learning_rate": 0.0001, "loss": 1.6796, "loss/crossentropy": 2.784927010536194, "loss/fcd": 1.4921875, "loss/idx": 6.0, "loss/logits": 0.1873757317662239, "step": 1022 }, { "epoch": 0.01527561053912602, "grad_norm": 0.5625, "grad_norm_var": 0.020731099446614585, "learning_rate": 0.0001, "loss": 1.7326, "loss/crossentropy": 2.5929077863693237, "loss/fcd": 1.55078125, "loss/idx": 6.0, "loss/logits": 0.18176910281181335, "step": 1023 }, { "epoch": 0.015290542709741001, "grad_norm": 0.57421875, "grad_norm_var": 0.02054284413655599, "learning_rate": 0.0001, "loss": 1.8381, "loss/crossentropy": 2.612320065498352, "loss/fcd": 1.62890625, "loss/idx": 6.0, "loss/logits": 0.20919139683246613, "step": 1024 }, { "epoch": 0.015305474880355984, "grad_norm": 0.54296875, "grad_norm_var": 0.02046941121419271, "learning_rate": 0.0001, "loss": 1.7577, "loss/crossentropy": 2.4743690490722656, "loss/fcd": 1.56640625, "loss/idx": 6.0, "loss/logits": 0.19128717482089996, "step": 1025 }, { "epoch": 0.015320407050970965, "grad_norm": 0.5234375, "grad_norm_var": 0.02046941121419271, "learning_rate": 0.0001, "loss": 1.7694, "loss/crossentropy": 2.4436780214309692, "loss/fcd": 1.5546875, "loss/idx": 6.0, "loss/logits": 0.21474337577819824, "step": 1026 }, { "epoch": 0.015335339221585946, "grad_norm": 0.55078125, "grad_norm_var": 0.020453135172526043, "learning_rate": 0.0001, "loss": 1.9482, "loss/crossentropy": 2.4892067909240723, "loss/fcd": 1.70703125, "loss/idx": 6.0, "loss/logits": 0.24121662974357605, "step": 1027 }, { "epoch": 0.015350271392200927, "grad_norm": 0.53125, "grad_norm_var": 0.020441627502441405, "learning_rate": 0.0001, "loss": 1.8773, "loss/crossentropy": 2.4016976356506348, "loss/fcd": 1.640625, "loss/idx": 6.0, "loss/logits": 0.23671425879001617, "step": 1028 }, { "epoch": 0.01536520356281591, "grad_norm": 0.50390625, "grad_norm_var": 0.019797627131144205, "learning_rate": 0.0001, "loss": 1.7514, "loss/crossentropy": 2.839547872543335, "loss/fcd": 1.546875, "loss/idx": 6.0, "loss/logits": 0.2044782042503357, "step": 1029 }, { "epoch": 0.01538013573343089, "grad_norm": 0.515625, "grad_norm_var": 0.01898371378580729, "learning_rate": 0.0001, "loss": 1.8493, "loss/crossentropy": 2.322245240211487, "loss/fcd": 1.63671875, "loss/idx": 6.0, "loss/logits": 0.21256640553474426, "step": 1030 }, { "epoch": 0.015395067904045871, "grad_norm": 0.498046875, "grad_norm_var": 0.002185678482055664, "learning_rate": 0.0001, "loss": 1.6567, "loss/crossentropy": 2.486846685409546, "loss/fcd": 1.48046875, "loss/idx": 6.0, "loss/logits": 0.17625004798173904, "step": 1031 }, { "epoch": 0.015410000074660852, "grad_norm": 0.62109375, "grad_norm_var": 0.0027292728424072265, "learning_rate": 0.0001, "loss": 2.055, "loss/crossentropy": 2.6589245796203613, "loss/fcd": 1.80859375, "loss/idx": 6.0, "loss/logits": 0.2464069500565529, "step": 1032 }, { "epoch": 0.015424932245275835, "grad_norm": 0.484375, "grad_norm_var": 0.0025094191233317057, "learning_rate": 0.0001, "loss": 1.6775, "loss/crossentropy": 2.2486242055892944, "loss/fcd": 1.5078125, "loss/idx": 6.0, "loss/logits": 0.16964885592460632, "step": 1033 }, { "epoch": 0.015439864415890816, "grad_norm": 0.54296875, "grad_norm_var": 0.002343479792277018, "learning_rate": 0.0001, "loss": 1.7148, "loss/crossentropy": 2.586572051048279, "loss/fcd": 1.51953125, "loss/idx": 6.0, "loss/logits": 0.19529356062412262, "step": 1034 }, { "epoch": 0.015454796586505797, "grad_norm": 0.58984375, "grad_norm_var": 0.002239338556925456, "learning_rate": 0.0001, "loss": 1.8468, "loss/crossentropy": 2.714082717895508, "loss/fcd": 1.625, "loss/idx": 6.0, "loss/logits": 0.22178450226783752, "step": 1035 }, { "epoch": 0.01546972875712078, "grad_norm": 0.47265625, "grad_norm_var": 0.0024295647939046225, "learning_rate": 0.0001, "loss": 1.7294, "loss/crossentropy": 2.320715546607971, "loss/fcd": 1.53515625, "loss/idx": 6.0, "loss/logits": 0.19423359632492065, "step": 1036 }, { "epoch": 0.01548466092773576, "grad_norm": 0.53515625, "grad_norm_var": 0.001715707778930664, "learning_rate": 0.0001, "loss": 1.7886, "loss/crossentropy": 2.7388995885849, "loss/fcd": 1.57421875, "loss/idx": 6.0, "loss/logits": 0.21433691680431366, "step": 1037 }, { "epoch": 0.015499593098350741, "grad_norm": 0.49609375, "grad_norm_var": 0.00159300168355306, "learning_rate": 0.0001, "loss": 1.7216, "loss/crossentropy": 2.627174973487854, "loss/fcd": 1.5234375, "loss/idx": 6.0, "loss/logits": 0.19816239923238754, "step": 1038 }, { "epoch": 0.015514525268965722, "grad_norm": 0.578125, "grad_norm_var": 0.0016675154368082682, "learning_rate": 0.0001, "loss": 1.708, "loss/crossentropy": 2.2871665954589844, "loss/fcd": 1.53515625, "loss/idx": 6.0, "loss/logits": 0.1728159263730049, "step": 1039 }, { "epoch": 0.015529457439580705, "grad_norm": 0.5546875, "grad_norm_var": 0.0015893141428629557, "learning_rate": 0.0001, "loss": 1.6132, "loss/crossentropy": 2.602549910545349, "loss/fcd": 1.44140625, "loss/idx": 6.0, "loss/logits": 0.1717797815799713, "step": 1040 }, { "epoch": 0.015544389610195686, "grad_norm": 0.5546875, "grad_norm_var": 0.0016122023264567057, "learning_rate": 0.0001, "loss": 1.7748, "loss/crossentropy": 2.658362030982971, "loss/fcd": 1.5546875, "loss/idx": 6.0, "loss/logits": 0.22009392827749252, "step": 1041 }, { "epoch": 0.015559321780810667, "grad_norm": 0.45703125, "grad_norm_var": 0.0019861698150634766, "learning_rate": 0.0001, "loss": 1.6258, "loss/crossentropy": 2.6368483304977417, "loss/fcd": 1.453125, "loss/idx": 6.0, "loss/logits": 0.17269417643547058, "step": 1042 }, { "epoch": 0.01557425395142565, "grad_norm": 0.53125, "grad_norm_var": 0.0019569238026936847, "learning_rate": 0.0001, "loss": 1.84, "loss/crossentropy": 2.529175043106079, "loss/fcd": 1.62890625, "loss/idx": 6.0, "loss/logits": 0.21105806529521942, "step": 1043 }, { "epoch": 0.01558918612204063, "grad_norm": 0.5703125, "grad_norm_var": 0.0020630995432535807, "learning_rate": 0.0001, "loss": 1.8264, "loss/crossentropy": 2.712172746658325, "loss/fcd": 1.625, "loss/idx": 6.0, "loss/logits": 0.2013680636882782, "step": 1044 }, { "epoch": 0.015604118292655612, "grad_norm": 0.71484375, "grad_norm_var": 0.004064671198527018, "learning_rate": 0.0001, "loss": 1.8359, "loss/crossentropy": 2.6467713117599487, "loss/fcd": 1.640625, "loss/idx": 6.0, "loss/logits": 0.19526171684265137, "step": 1045 }, { "epoch": 0.015619050463270593, "grad_norm": 0.53125, "grad_norm_var": 0.00401914914449056, "learning_rate": 0.0001, "loss": 1.7747, "loss/crossentropy": 2.707419753074646, "loss/fcd": 1.58984375, "loss/idx": 6.0, "loss/logits": 0.1848393678665161, "step": 1046 }, { "epoch": 0.015633982633885574, "grad_norm": 0.470703125, "grad_norm_var": 0.004239892959594727, "learning_rate": 0.0001, "loss": 1.6991, "loss/crossentropy": 2.640596628189087, "loss/fcd": 1.515625, "loss/idx": 6.0, "loss/logits": 0.18342873454093933, "step": 1047 }, { "epoch": 0.015648914804500556, "grad_norm": 0.51953125, "grad_norm_var": 0.003841511408487956, "learning_rate": 0.0001, "loss": 1.4506, "loss/crossentropy": 2.8776493072509766, "loss/fcd": 1.3125, "loss/idx": 6.0, "loss/logits": 0.13808635622262955, "step": 1048 }, { "epoch": 0.01566384697511554, "grad_norm": 0.462890625, "grad_norm_var": 0.004023170471191407, "learning_rate": 0.0001, "loss": 1.4478, "loss/crossentropy": 2.757124900817871, "loss/fcd": 1.30078125, "loss/idx": 6.0, "loss/logits": 0.14702700823545456, "step": 1049 }, { "epoch": 0.015678779145730518, "grad_norm": 0.5703125, "grad_norm_var": 0.00409393310546875, "learning_rate": 0.0001, "loss": 1.8706, "loss/crossentropy": 2.544234871864319, "loss/fcd": 1.65234375, "loss/idx": 6.0, "loss/logits": 0.21821290254592896, "step": 1050 }, { "epoch": 0.0156937113163455, "grad_norm": 0.5625, "grad_norm_var": 0.00395196278889974, "learning_rate": 0.0001, "loss": 1.9078, "loss/crossentropy": 2.8513458967208862, "loss/fcd": 1.68359375, "loss/idx": 6.0, "loss/logits": 0.2242419719696045, "step": 1051 }, { "epoch": 0.015708643486960484, "grad_norm": 0.4921875, "grad_norm_var": 0.0038098653157552084, "learning_rate": 0.0001, "loss": 1.7442, "loss/crossentropy": 2.4249703884124756, "loss/fcd": 1.5625, "loss/idx": 6.0, "loss/logits": 0.18165121227502823, "step": 1052 }, { "epoch": 0.015723575657575463, "grad_norm": 0.546875, "grad_norm_var": 0.0038146336873372396, "learning_rate": 0.0001, "loss": 1.6334, "loss/crossentropy": 2.3974103927612305, "loss/fcd": 1.453125, "loss/idx": 6.0, "loss/logits": 0.1802271157503128, "step": 1053 }, { "epoch": 0.015738507828190446, "grad_norm": 0.578125, "grad_norm_var": 0.0037732442220052083, "learning_rate": 0.0001, "loss": 1.7285, "loss/crossentropy": 2.800647497177124, "loss/fcd": 1.5390625, "loss/idx": 6.0, "loss/logits": 0.18946100026369095, "step": 1054 }, { "epoch": 0.015753439998805425, "grad_norm": 0.55078125, "grad_norm_var": 0.0036935806274414062, "learning_rate": 0.0001, "loss": 1.7413, "loss/crossentropy": 2.7701534032821655, "loss/fcd": 1.53125, "loss/idx": 6.0, "loss/logits": 0.21001631021499634, "step": 1055 }, { "epoch": 0.015768372169420408, "grad_norm": 1.1015625, "grad_norm_var": 0.02332909901936849, "learning_rate": 0.0001, "loss": 1.9825, "loss/crossentropy": 2.4670101404190063, "loss/fcd": 1.73046875, "loss/idx": 6.0, "loss/logits": 0.25201089680194855, "step": 1056 }, { "epoch": 0.01578330434003539, "grad_norm": 0.46875, "grad_norm_var": 0.02403405507405599, "learning_rate": 0.0001, "loss": 1.7384, "loss/crossentropy": 2.56765079498291, "loss/fcd": 1.53125, "loss/idx": 6.0, "loss/logits": 0.20711445063352585, "step": 1057 }, { "epoch": 0.01579823651065037, "grad_norm": 0.53515625, "grad_norm_var": 0.02323296864827474, "learning_rate": 0.0001, "loss": 1.6809, "loss/crossentropy": 2.64453125, "loss/fcd": 1.49609375, "loss/idx": 6.0, "loss/logits": 0.1848081573843956, "step": 1058 }, { "epoch": 0.015813168681265352, "grad_norm": 0.515625, "grad_norm_var": 0.023340288798014322, "learning_rate": 0.0001, "loss": 1.7947, "loss/crossentropy": 2.6654093265533447, "loss/fcd": 1.58203125, "loss/idx": 6.0, "loss/logits": 0.212711364030838, "step": 1059 }, { "epoch": 0.015828100851880335, "grad_norm": 0.5234375, "grad_norm_var": 0.023503557840983073, "learning_rate": 0.0001, "loss": 1.64, "loss/crossentropy": 2.512184262275696, "loss/fcd": 1.4765625, "loss/idx": 6.0, "loss/logits": 0.16342011094093323, "step": 1060 }, { "epoch": 0.015843033022495314, "grad_norm": 0.51953125, "grad_norm_var": 0.02215569814046224, "learning_rate": 0.0001, "loss": 1.7657, "loss/crossentropy": 2.6045628786087036, "loss/fcd": 1.5625, "loss/idx": 6.0, "loss/logits": 0.20317083597183228, "step": 1061 }, { "epoch": 0.015857965193110297, "grad_norm": 0.56640625, "grad_norm_var": 0.022101338704427084, "learning_rate": 0.0001, "loss": 1.7964, "loss/crossentropy": 2.3390029668807983, "loss/fcd": 1.61328125, "loss/idx": 6.0, "loss/logits": 0.18314598500728607, "step": 1062 }, { "epoch": 0.01587289736372528, "grad_norm": 0.515625, "grad_norm_var": 0.021683486302693684, "learning_rate": 0.0001, "loss": 1.6747, "loss/crossentropy": 2.6241334676742554, "loss/fcd": 1.48046875, "loss/idx": 6.0, "loss/logits": 0.19422227889299393, "step": 1063 }, { "epoch": 0.01588782953434026, "grad_norm": 0.484375, "grad_norm_var": 0.021970733006795248, "learning_rate": 0.0001, "loss": 1.639, "loss/crossentropy": 2.6619738340377808, "loss/fcd": 1.4765625, "loss/idx": 6.0, "loss/logits": 0.16240306943655014, "step": 1064 }, { "epoch": 0.01590276170495524, "grad_norm": 0.50390625, "grad_norm_var": 0.02153313954671224, "learning_rate": 0.0001, "loss": 1.7279, "loss/crossentropy": 2.4656083583831787, "loss/fcd": 1.53125, "loss/idx": 6.0, "loss/logits": 0.1966322660446167, "step": 1065 }, { "epoch": 0.01591769387557022, "grad_norm": 0.73828125, "grad_norm_var": 0.0234222412109375, "learning_rate": 0.0001, "loss": 1.7699, "loss/crossentropy": 2.4857001304626465, "loss/fcd": 1.578125, "loss/idx": 6.0, "loss/logits": 0.19179469347000122, "step": 1066 }, { "epoch": 0.015932626046185203, "grad_norm": 0.498046875, "grad_norm_var": 0.023790979385375978, "learning_rate": 0.0001, "loss": 1.8223, "loss/crossentropy": 2.5338550806045532, "loss/fcd": 1.59765625, "loss/idx": 6.0, "loss/logits": 0.22468920052051544, "step": 1067 }, { "epoch": 0.015947558216800186, "grad_norm": 0.4296875, "grad_norm_var": 0.02469328244527181, "learning_rate": 0.0001, "loss": 1.5337, "loss/crossentropy": 2.4822702407836914, "loss/fcd": 1.37109375, "loss/idx": 6.0, "loss/logits": 0.16255860030651093, "step": 1068 }, { "epoch": 0.015962490387415165, "grad_norm": 0.5234375, "grad_norm_var": 0.02479132016499837, "learning_rate": 0.0001, "loss": 1.7657, "loss/crossentropy": 2.417506456375122, "loss/fcd": 1.578125, "loss/idx": 6.0, "loss/logits": 0.18761204183101654, "step": 1069 }, { "epoch": 0.015977422558030148, "grad_norm": 0.51953125, "grad_norm_var": 0.024909575780232746, "learning_rate": 0.0001, "loss": 1.7423, "loss/crossentropy": 2.6273897886276245, "loss/fcd": 1.54296875, "loss/idx": 6.0, "loss/logits": 0.19936949759721756, "step": 1070 }, { "epoch": 0.01599235472864513, "grad_norm": 0.54296875, "grad_norm_var": 0.02492521603902181, "learning_rate": 0.0001, "loss": 1.6031, "loss/crossentropy": 2.4808638095855713, "loss/fcd": 1.4453125, "loss/idx": 6.0, "loss/logits": 0.15783234685659409, "step": 1071 }, { "epoch": 0.01600728689926011, "grad_norm": 0.55078125, "grad_norm_var": 0.004235061009724935, "learning_rate": 0.0001, "loss": 1.7248, "loss/crossentropy": 2.5770764350891113, "loss/fcd": 1.53125, "loss/idx": 6.0, "loss/logits": 0.19359392672777176, "step": 1072 }, { "epoch": 0.016022219069875093, "grad_norm": 0.65625, "grad_norm_var": 0.004970534642537435, "learning_rate": 0.0001, "loss": 1.9244, "loss/crossentropy": 2.357397437095642, "loss/fcd": 1.703125, "loss/idx": 6.0, "loss/logits": 0.22131529450416565, "step": 1073 }, { "epoch": 0.016037151240490075, "grad_norm": 0.73828125, "grad_norm_var": 0.007446781794230143, "learning_rate": 0.0001, "loss": 1.9803, "loss/crossentropy": 2.9358640909194946, "loss/fcd": 1.74609375, "loss/idx": 6.0, "loss/logits": 0.23415900021791458, "step": 1074 }, { "epoch": 0.016052083411105054, "grad_norm": 0.48046875, "grad_norm_var": 0.0076928297678629555, "learning_rate": 0.0001, "loss": 1.7242, "loss/crossentropy": 2.5545125007629395, "loss/fcd": 1.52734375, "loss/idx": 6.0, "loss/logits": 0.19681578129529953, "step": 1075 }, { "epoch": 0.016067015581720037, "grad_norm": 0.58984375, "grad_norm_var": 0.007738224665323893, "learning_rate": 0.0001, "loss": 1.9907, "loss/crossentropy": 1.9322530627250671, "loss/fcd": 1.7578125, "loss/idx": 6.0, "loss/logits": 0.23285391926765442, "step": 1076 }, { "epoch": 0.01608194775233502, "grad_norm": 0.486328125, "grad_norm_var": 0.007957903544108073, "learning_rate": 0.0001, "loss": 1.6095, "loss/crossentropy": 2.7540615797042847, "loss/fcd": 1.43359375, "loss/idx": 6.0, "loss/logits": 0.1758873388171196, "step": 1077 }, { "epoch": 0.01609687992295, "grad_norm": 0.478515625, "grad_norm_var": 0.008266178766886394, "learning_rate": 0.0001, "loss": 1.5728, "loss/crossentropy": 2.6025326251983643, "loss/fcd": 1.40625, "loss/idx": 6.0, "loss/logits": 0.16653118282556534, "step": 1078 }, { "epoch": 0.016111812093564982, "grad_norm": 0.4765625, "grad_norm_var": 0.008519856135050456, "learning_rate": 0.0001, "loss": 1.7698, "loss/crossentropy": 2.574973702430725, "loss/fcd": 1.5546875, "loss/idx": 6.0, "loss/logits": 0.21514993906021118, "step": 1079 }, { "epoch": 0.01612674426417996, "grad_norm": 0.4609375, "grad_norm_var": 0.008739201227823894, "learning_rate": 0.0001, "loss": 1.7841, "loss/crossentropy": 2.5307178497314453, "loss/fcd": 1.59375, "loss/idx": 6.0, "loss/logits": 0.1903010457754135, "step": 1080 }, { "epoch": 0.016141676434794944, "grad_norm": 0.53515625, "grad_norm_var": 0.00864103635152181, "learning_rate": 0.0001, "loss": 1.7499, "loss/crossentropy": 2.674168348312378, "loss/fcd": 1.546875, "loss/idx": 6.0, "loss/logits": 0.20304765552282333, "step": 1081 }, { "epoch": 0.016156608605409926, "grad_norm": 0.5234375, "grad_norm_var": 0.005962483088175456, "learning_rate": 0.0001, "loss": 1.7734, "loss/crossentropy": 2.5411276817321777, "loss/fcd": 1.58203125, "loss/idx": 6.0, "loss/logits": 0.1913544237613678, "step": 1082 }, { "epoch": 0.016171540776024906, "grad_norm": 0.431640625, "grad_norm_var": 0.006526676813761393, "learning_rate": 0.0001, "loss": 1.5836, "loss/crossentropy": 2.050433099269867, "loss/fcd": 1.4375, "loss/idx": 6.0, "loss/logits": 0.14609526097774506, "step": 1083 }, { "epoch": 0.01618647294663989, "grad_norm": 0.5859375, "grad_norm_var": 0.006035852432250977, "learning_rate": 0.0001, "loss": 1.7856, "loss/crossentropy": 2.2005414366722107, "loss/fcd": 1.6015625, "loss/idx": 6.0, "loss/logits": 0.18407931923866272, "step": 1084 }, { "epoch": 0.01620140511725487, "grad_norm": 0.498046875, "grad_norm_var": 0.006119537353515625, "learning_rate": 0.0001, "loss": 1.79, "loss/crossentropy": 2.430140733718872, "loss/fcd": 1.5625, "loss/idx": 6.0, "loss/logits": 0.2274610549211502, "step": 1085 }, { "epoch": 0.01621633728786985, "grad_norm": 0.4296875, "grad_norm_var": 0.00680535634358724, "learning_rate": 0.0001, "loss": 1.5372, "loss/crossentropy": 2.7481919527053833, "loss/fcd": 1.37890625, "loss/idx": 6.0, "loss/logits": 0.1583072543144226, "step": 1086 }, { "epoch": 0.016231269458484833, "grad_norm": 0.57421875, "grad_norm_var": 0.00692437489827474, "learning_rate": 0.0001, "loss": 1.8689, "loss/crossentropy": 2.4764903783798218, "loss/fcd": 1.61328125, "loss/idx": 6.0, "loss/logits": 0.255642831325531, "step": 1087 }, { "epoch": 0.016246201629099816, "grad_norm": 0.42578125, "grad_norm_var": 0.0075713475545247395, "learning_rate": 0.0001, "loss": 1.6201, "loss/crossentropy": 2.584195852279663, "loss/fcd": 1.44140625, "loss/idx": 6.0, "loss/logits": 0.1786525845527649, "step": 1088 }, { "epoch": 0.016261133799714795, "grad_norm": 0.5859375, "grad_norm_var": 0.0066329320271809895, "learning_rate": 0.0001, "loss": 1.7831, "loss/crossentropy": 2.3816990852355957, "loss/fcd": 1.57421875, "loss/idx": 6.0, "loss/logits": 0.20887838304042816, "step": 1089 }, { "epoch": 0.016276065970329778, "grad_norm": 0.462890625, "grad_norm_var": 0.0033138116200764974, "learning_rate": 0.0001, "loss": 1.6247, "loss/crossentropy": 2.5489161014556885, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.16374288499355316, "step": 1090 }, { "epoch": 0.016290998140944757, "grad_norm": 0.48828125, "grad_norm_var": 0.0032956282297770183, "learning_rate": 0.0001, "loss": 1.6529, "loss/crossentropy": 2.699458956718445, "loss/fcd": 1.4765625, "loss/idx": 6.0, "loss/logits": 0.176344595849514, "step": 1091 }, { "epoch": 0.01630593031155974, "grad_norm": 0.462890625, "grad_norm_var": 0.0028172810872395832, "learning_rate": 0.0001, "loss": 1.6489, "loss/crossentropy": 2.5479389429092407, "loss/fcd": 1.46875, "loss/idx": 6.0, "loss/logits": 0.18014751374721527, "step": 1092 }, { "epoch": 0.016320862482174722, "grad_norm": 0.498046875, "grad_norm_var": 0.0028136571248372397, "learning_rate": 0.0001, "loss": 1.7677, "loss/crossentropy": 2.7490711212158203, "loss/fcd": 1.55859375, "loss/idx": 6.0, "loss/logits": 0.2091096192598343, "step": 1093 }, { "epoch": 0.0163357946527897, "grad_norm": 1.046875, "grad_norm_var": 0.02176359494527181, "learning_rate": 0.0001, "loss": 1.9129, "loss/crossentropy": 2.8259459733963013, "loss/fcd": 1.66015625, "loss/idx": 6.0, "loss/logits": 0.25274983793497086, "step": 1094 }, { "epoch": 0.016350726823404684, "grad_norm": 0.47265625, "grad_norm_var": 0.02179258664449056, "learning_rate": 0.0001, "loss": 1.513, "loss/crossentropy": 2.47867751121521, "loss/fcd": 1.36328125, "loss/idx": 6.0, "loss/logits": 0.14975561946630478, "step": 1095 }, { "epoch": 0.016365658994019667, "grad_norm": 0.435546875, "grad_norm_var": 0.02206719716389974, "learning_rate": 0.0001, "loss": 1.5513, "loss/crossentropy": 2.477361798286438, "loss/fcd": 1.390625, "loss/idx": 6.0, "loss/logits": 0.160676509141922, "step": 1096 }, { "epoch": 0.016380591164634646, "grad_norm": 0.4765625, "grad_norm_var": 0.02223027547200521, "learning_rate": 0.0001, "loss": 1.7071, "loss/crossentropy": 2.768059492111206, "loss/fcd": 1.51171875, "loss/idx": 6.0, "loss/logits": 0.19536980986595154, "step": 1097 }, { "epoch": 0.01639552333524963, "grad_norm": 0.482421875, "grad_norm_var": 0.02234342892964681, "learning_rate": 0.0001, "loss": 1.7626, "loss/crossentropy": 2.445541024208069, "loss/fcd": 1.546875, "loss/idx": 6.0, "loss/logits": 0.2156984806060791, "step": 1098 }, { "epoch": 0.01641045550586461, "grad_norm": 0.50390625, "grad_norm_var": 0.021795908610026043, "learning_rate": 0.0001, "loss": 1.5739, "loss/crossentropy": 2.301167607307434, "loss/fcd": 1.4140625, "loss/idx": 6.0, "loss/logits": 0.15980088710784912, "step": 1099 }, { "epoch": 0.01642538767647959, "grad_norm": 0.40625, "grad_norm_var": 0.02239837646484375, "learning_rate": 0.0001, "loss": 1.6301, "loss/crossentropy": 2.4962133169174194, "loss/fcd": 1.453125, "loss/idx": 6.0, "loss/logits": 0.1769598126411438, "step": 1100 }, { "epoch": 0.016440319847094573, "grad_norm": 0.474609375, "grad_norm_var": 0.022487640380859375, "learning_rate": 0.0001, "loss": 1.7041, "loss/crossentropy": 2.6290271282196045, "loss/fcd": 1.5, "loss/idx": 6.0, "loss/logits": 0.2041047364473343, "step": 1101 }, { "epoch": 0.016455252017709553, "grad_norm": 0.43359375, "grad_norm_var": 0.02244459788004557, "learning_rate": 0.0001, "loss": 1.6305, "loss/crossentropy": 2.6902036666870117, "loss/fcd": 1.45703125, "loss/idx": 6.0, "loss/logits": 0.17346254736185074, "step": 1102 }, { "epoch": 0.016470184188324535, "grad_norm": 0.46484375, "grad_norm_var": 0.022319984436035157, "learning_rate": 0.0001, "loss": 1.7848, "loss/crossentropy": 2.574817419052124, "loss/fcd": 1.578125, "loss/idx": 6.0, "loss/logits": 0.20672458410263062, "step": 1103 }, { "epoch": 0.016485116358939518, "grad_norm": 0.52734375, "grad_norm_var": 0.02185713450113932, "learning_rate": 0.0001, "loss": 1.5013, "loss/crossentropy": 2.7195013761520386, "loss/fcd": 1.359375, "loss/idx": 6.0, "loss/logits": 0.14197393506765366, "step": 1104 }, { "epoch": 0.016500048529554497, "grad_norm": 0.59765625, "grad_norm_var": 0.021978251139322915, "learning_rate": 0.0001, "loss": 1.7151, "loss/crossentropy": 2.817270874977112, "loss/fcd": 1.52734375, "loss/idx": 6.0, "loss/logits": 0.18773505836725235, "step": 1105 }, { "epoch": 0.01651498070016948, "grad_norm": 0.6328125, "grad_norm_var": 0.022610203425089518, "learning_rate": 0.0001, "loss": 1.7907, "loss/crossentropy": 2.5092287063598633, "loss/fcd": 1.5625, "loss/idx": 6.0, "loss/logits": 0.22824940085411072, "step": 1106 }, { "epoch": 0.016529912870784463, "grad_norm": 0.6171875, "grad_norm_var": 0.02301303545633952, "learning_rate": 0.0001, "loss": 1.8864, "loss/crossentropy": 2.5941046476364136, "loss/fcd": 1.6484375, "loss/idx": 6.0, "loss/logits": 0.2380032166838646, "step": 1107 }, { "epoch": 0.016544845041399442, "grad_norm": 0.48828125, "grad_norm_var": 0.02281487782796224, "learning_rate": 0.0001, "loss": 1.8045, "loss/crossentropy": 2.718872547149658, "loss/fcd": 1.59375, "loss/idx": 6.0, "loss/logits": 0.21072804927825928, "step": 1108 }, { "epoch": 0.016559777212014425, "grad_norm": 0.498046875, "grad_norm_var": 0.02281487782796224, "learning_rate": 0.0001, "loss": 1.6336, "loss/crossentropy": 2.7627276182174683, "loss/fcd": 1.453125, "loss/idx": 6.0, "loss/logits": 0.18051744997501373, "step": 1109 }, { "epoch": 0.016574709382629407, "grad_norm": 0.62109375, "grad_norm_var": 0.0050809224446614586, "learning_rate": 0.0001, "loss": 1.9104, "loss/crossentropy": 2.403494715690613, "loss/fcd": 1.69921875, "loss/idx": 6.0, "loss/logits": 0.21119916439056396, "step": 1110 }, { "epoch": 0.016589641553244387, "grad_norm": 0.4375, "grad_norm_var": 0.00532525380452474, "learning_rate": 0.0001, "loss": 1.6656, "loss/crossentropy": 2.596889853477478, "loss/fcd": 1.47265625, "loss/idx": 6.0, "loss/logits": 0.19291236251592636, "step": 1111 }, { "epoch": 0.01660457372385937, "grad_norm": 0.443359375, "grad_norm_var": 0.005255572001139323, "learning_rate": 0.0001, "loss": 1.5324, "loss/crossentropy": 2.6120318174362183, "loss/fcd": 1.3671875, "loss/idx": 6.0, "loss/logits": 0.16524401307106018, "step": 1112 }, { "epoch": 0.016619505894474352, "grad_norm": 0.54296875, "grad_norm_var": 0.005265299479166667, "learning_rate": 0.0001, "loss": 1.8961, "loss/crossentropy": 2.5684638023376465, "loss/fcd": 1.6953125, "loss/idx": 6.0, "loss/logits": 0.20081853866577148, "step": 1113 }, { "epoch": 0.01663443806508933, "grad_norm": 0.5546875, "grad_norm_var": 0.0053188165028889975, "learning_rate": 0.0001, "loss": 1.6964, "loss/crossentropy": 2.8103604316711426, "loss/fcd": 1.5, "loss/idx": 6.0, "loss/logits": 0.19644811004400253, "step": 1114 }, { "epoch": 0.016649370235704314, "grad_norm": 0.5078125, "grad_norm_var": 0.005313857396443685, "learning_rate": 0.0001, "loss": 1.7268, "loss/crossentropy": 2.799278974533081, "loss/fcd": 1.5234375, "loss/idx": 6.0, "loss/logits": 0.20339705049991608, "step": 1115 }, { "epoch": 0.016664302406319293, "grad_norm": 0.55078125, "grad_norm_var": 0.004514042536417643, "learning_rate": 0.0001, "loss": 1.7984, "loss/crossentropy": 2.626010537147522, "loss/fcd": 1.59765625, "loss/idx": 6.0, "loss/logits": 0.20072130858898163, "step": 1116 }, { "epoch": 0.016679234576934276, "grad_norm": 0.53125, "grad_norm_var": 0.004337501525878906, "learning_rate": 0.0001, "loss": 1.6067, "loss/crossentropy": 2.4577295780181885, "loss/fcd": 1.43359375, "loss/idx": 6.0, "loss/logits": 0.17310378700494766, "step": 1117 }, { "epoch": 0.01669416674754926, "grad_norm": 0.44140625, "grad_norm_var": 0.004242897033691406, "learning_rate": 0.0001, "loss": 1.6291, "loss/crossentropy": 2.533613443374634, "loss/fcd": 1.4453125, "loss/idx": 6.0, "loss/logits": 0.18379881232976913, "step": 1118 }, { "epoch": 0.016709098918164238, "grad_norm": 0.48828125, "grad_norm_var": 0.004078102111816406, "learning_rate": 0.0001, "loss": 1.8031, "loss/crossentropy": 2.4966949224472046, "loss/fcd": 1.59375, "loss/idx": 6.0, "loss/logits": 0.2093115895986557, "step": 1119 }, { "epoch": 0.01672403108877922, "grad_norm": 0.5078125, "grad_norm_var": 0.004108937581380209, "learning_rate": 0.0001, "loss": 1.6413, "loss/crossentropy": 2.5867191553115845, "loss/fcd": 1.46484375, "loss/idx": 6.0, "loss/logits": 0.17646171152591705, "step": 1120 }, { "epoch": 0.016738963259394203, "grad_norm": 0.494140625, "grad_norm_var": 0.0038284142812093098, "learning_rate": 0.0001, "loss": 1.7152, "loss/crossentropy": 2.971528172492981, "loss/fcd": 1.51171875, "loss/idx": 6.0, "loss/logits": 0.20348946750164032, "step": 1121 }, { "epoch": 0.016753895430009182, "grad_norm": 0.5546875, "grad_norm_var": 0.0030591169993082683, "learning_rate": 0.0001, "loss": 1.7426, "loss/crossentropy": 2.5865492820739746, "loss/fcd": 1.54296875, "loss/idx": 6.0, "loss/logits": 0.19963253289461136, "step": 1122 }, { "epoch": 0.016768827600624165, "grad_norm": 0.52734375, "grad_norm_var": 0.0023689111073811847, "learning_rate": 0.0001, "loss": 1.8772, "loss/crossentropy": 2.428372383117676, "loss/fcd": 1.65234375, "loss/idx": 6.0, "loss/logits": 0.22489413619041443, "step": 1123 }, { "epoch": 0.016783759771239148, "grad_norm": 0.458984375, "grad_norm_var": 0.0025145848592122394, "learning_rate": 0.0001, "loss": 1.6141, "loss/crossentropy": 2.5533376932144165, "loss/fcd": 1.4453125, "loss/idx": 6.0, "loss/logits": 0.16882772743701935, "step": 1124 }, { "epoch": 0.016798691941854127, "grad_norm": 0.53125, "grad_norm_var": 0.002530527114868164, "learning_rate": 0.0001, "loss": 1.7549, "loss/crossentropy": 2.6719311475753784, "loss/fcd": 1.55078125, "loss/idx": 6.0, "loss/logits": 0.20412559807300568, "step": 1125 }, { "epoch": 0.01681362411246911, "grad_norm": 0.44140625, "grad_norm_var": 0.0019368330637613933, "learning_rate": 0.0001, "loss": 1.6545, "loss/crossentropy": 2.4006348848342896, "loss/fcd": 1.46484375, "loss/idx": 6.0, "loss/logits": 0.18963538110256195, "step": 1126 }, { "epoch": 0.01682855628308409, "grad_norm": 0.40234375, "grad_norm_var": 0.0023110548655192057, "learning_rate": 0.0001, "loss": 1.4978, "loss/crossentropy": 2.6121081113815308, "loss/fcd": 1.3515625, "loss/idx": 6.0, "loss/logits": 0.14626048505306244, "step": 1127 }, { "epoch": 0.01684348845369907, "grad_norm": 0.494140625, "grad_norm_var": 0.0020978132883707683, "learning_rate": 0.0001, "loss": 1.869, "loss/crossentropy": 2.564122796058655, "loss/fcd": 1.63671875, "loss/idx": 6.0, "loss/logits": 0.23223726451396942, "step": 1128 }, { "epoch": 0.016858420624314054, "grad_norm": 0.447265625, "grad_norm_var": 0.0021453221638997396, "learning_rate": 0.0001, "loss": 1.6052, "loss/crossentropy": 2.371549606323242, "loss/fcd": 1.43359375, "loss/idx": 6.0, "loss/logits": 0.17157655954360962, "step": 1129 }, { "epoch": 0.016873352794929034, "grad_norm": 0.57421875, "grad_norm_var": 0.0023223876953125, "learning_rate": 0.0001, "loss": 1.8083, "loss/crossentropy": 2.6700668334960938, "loss/fcd": 1.59375, "loss/idx": 6.0, "loss/logits": 0.21453458815813065, "step": 1130 }, { "epoch": 0.016888284965544016, "grad_norm": 0.4375, "grad_norm_var": 0.002530670166015625, "learning_rate": 0.0001, "loss": 1.5809, "loss/crossentropy": 2.5727399587631226, "loss/fcd": 1.41796875, "loss/idx": 6.0, "loss/logits": 0.16292241215705872, "step": 1131 }, { "epoch": 0.016903217136159, "grad_norm": 0.95703125, "grad_norm_var": 0.01599299112955729, "learning_rate": 0.0001, "loss": 1.9989, "loss/crossentropy": 2.248908281326294, "loss/fcd": 1.75390625, "loss/idx": 6.0, "loss/logits": 0.24498894810676575, "step": 1132 }, { "epoch": 0.016918149306773978, "grad_norm": 0.4375, "grad_norm_var": 0.01637751261393229, "learning_rate": 0.0001, "loss": 1.6055, "loss/crossentropy": 2.702947497367859, "loss/fcd": 1.42578125, "loss/idx": 6.0, "loss/logits": 0.17967890202999115, "step": 1133 }, { "epoch": 0.01693308147738896, "grad_norm": 0.53125, "grad_norm_var": 0.01603387196858724, "learning_rate": 0.0001, "loss": 1.7999, "loss/crossentropy": 2.477932572364807, "loss/fcd": 1.59765625, "loss/idx": 6.0, "loss/logits": 0.2022503912448883, "step": 1134 }, { "epoch": 0.016948013648003944, "grad_norm": 0.486328125, "grad_norm_var": 0.01604180335998535, "learning_rate": 0.0001, "loss": 1.7251, "loss/crossentropy": 2.6051642894744873, "loss/fcd": 1.515625, "loss/idx": 6.0, "loss/logits": 0.20952476561069489, "step": 1135 }, { "epoch": 0.016962945818618923, "grad_norm": 0.41796875, "grad_norm_var": 0.016664743423461914, "learning_rate": 0.0001, "loss": 1.6029, "loss/crossentropy": 2.4434300661087036, "loss/fcd": 1.42578125, "loss/idx": 6.0, "loss/logits": 0.17716213315725327, "step": 1136 }, { "epoch": 0.016977877989233905, "grad_norm": 0.48046875, "grad_norm_var": 0.016709136962890624, "learning_rate": 0.0001, "loss": 1.5944, "loss/crossentropy": 2.532817840576172, "loss/fcd": 1.42578125, "loss/idx": 6.0, "loss/logits": 0.1686493381857872, "step": 1137 }, { "epoch": 0.016992810159848888, "grad_norm": 0.4921875, "grad_norm_var": 0.01659113566080729, "learning_rate": 0.0001, "loss": 1.7766, "loss/crossentropy": 2.6693389415740967, "loss/fcd": 1.578125, "loss/idx": 6.0, "loss/logits": 0.19849102199077606, "step": 1138 }, { "epoch": 0.017007742330463867, "grad_norm": 0.48046875, "grad_norm_var": 0.01660334269205729, "learning_rate": 0.0001, "loss": 1.5415, "loss/crossentropy": 2.757236123085022, "loss/fcd": 1.375, "loss/idx": 6.0, "loss/logits": 0.16651207208633423, "step": 1139 }, { "epoch": 0.01702267450107885, "grad_norm": 0.431640625, "grad_norm_var": 0.016815630594889323, "learning_rate": 0.0001, "loss": 1.6417, "loss/crossentropy": 2.5029356479644775, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.1807950809597969, "step": 1140 }, { "epoch": 0.01703760667169383, "grad_norm": 0.71484375, "grad_norm_var": 0.019621531168619793, "learning_rate": 0.0001, "loss": 1.8059, "loss/crossentropy": 2.843307852745056, "loss/fcd": 1.60546875, "loss/idx": 6.0, "loss/logits": 0.20043348520994186, "step": 1141 }, { "epoch": 0.017052538842308812, "grad_norm": 0.46484375, "grad_norm_var": 0.019428507486979166, "learning_rate": 0.0001, "loss": 1.6537, "loss/crossentropy": 2.6783461570739746, "loss/fcd": 1.46875, "loss/idx": 6.0, "loss/logits": 0.18495432287454605, "step": 1142 }, { "epoch": 0.017067471012923795, "grad_norm": 0.5, "grad_norm_var": 0.018549537658691405, "learning_rate": 0.0001, "loss": 1.6484, "loss/crossentropy": 2.5440350770950317, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.1874828264117241, "step": 1143 }, { "epoch": 0.017082403183538774, "grad_norm": 0.4453125, "grad_norm_var": 0.01887815793355306, "learning_rate": 0.0001, "loss": 1.51, "loss/crossentropy": 2.4569047689437866, "loss/fcd": 1.35546875, "loss/idx": 6.0, "loss/logits": 0.15451618283987045, "step": 1144 }, { "epoch": 0.017097335354153757, "grad_norm": 0.5078125, "grad_norm_var": 0.018530782063802084, "learning_rate": 0.0001, "loss": 1.6079, "loss/crossentropy": 2.605596423149109, "loss/fcd": 1.44140625, "loss/idx": 6.0, "loss/logits": 0.16648275405168533, "step": 1145 }, { "epoch": 0.01711226752476874, "grad_norm": 0.484375, "grad_norm_var": 0.018415260314941406, "learning_rate": 0.0001, "loss": 1.7809, "loss/crossentropy": 2.4507850408554077, "loss/fcd": 1.56640625, "loss/idx": 6.0, "loss/logits": 0.21446262300014496, "step": 1146 }, { "epoch": 0.01712719969538372, "grad_norm": 0.470703125, "grad_norm_var": 0.018132893244425456, "learning_rate": 0.0001, "loss": 1.5468, "loss/crossentropy": 2.5292779207229614, "loss/fcd": 1.3828125, "loss/idx": 6.0, "loss/logits": 0.16397973895072937, "step": 1147 }, { "epoch": 0.0171421318659987, "grad_norm": 0.4453125, "grad_norm_var": 0.004606993993123373, "learning_rate": 0.0001, "loss": 1.608, "loss/crossentropy": 2.413945198059082, "loss/fcd": 1.4296875, "loss/idx": 6.0, "loss/logits": 0.17828668653964996, "step": 1148 }, { "epoch": 0.017157064036613684, "grad_norm": 0.484375, "grad_norm_var": 0.004435332616170248, "learning_rate": 0.0001, "loss": 1.8044, "loss/crossentropy": 2.469904065132141, "loss/fcd": 1.57421875, "loss/idx": 6.0, "loss/logits": 0.23020246624946594, "step": 1149 }, { "epoch": 0.017171996207228663, "grad_norm": 0.48828125, "grad_norm_var": 0.00431364377339681, "learning_rate": 0.0001, "loss": 1.7087, "loss/crossentropy": 2.64601993560791, "loss/fcd": 1.51171875, "loss/idx": 6.0, "loss/logits": 0.19701046496629715, "step": 1150 }, { "epoch": 0.017186928377843646, "grad_norm": 0.4375, "grad_norm_var": 0.004468218485514323, "learning_rate": 0.0001, "loss": 1.715, "loss/crossentropy": 2.6214022636413574, "loss/fcd": 1.51953125, "loss/idx": 6.0, "loss/logits": 0.1954970881342888, "step": 1151 }, { "epoch": 0.017201860548458625, "grad_norm": 0.451171875, "grad_norm_var": 0.004244216283162435, "learning_rate": 0.0001, "loss": 1.6446, "loss/crossentropy": 2.631308913230896, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.1836615949869156, "step": 1152 }, { "epoch": 0.017216792719073608, "grad_norm": 0.46484375, "grad_norm_var": 0.0042714277903238935, "learning_rate": 0.0001, "loss": 1.6015, "loss/crossentropy": 2.698973059654236, "loss/fcd": 1.4296875, "loss/idx": 6.0, "loss/logits": 0.1718554049730301, "step": 1153 }, { "epoch": 0.01723172488968859, "grad_norm": 0.44921875, "grad_norm_var": 0.004346958796183268, "learning_rate": 0.0001, "loss": 1.6272, "loss/crossentropy": 2.672145128250122, "loss/fcd": 1.45703125, "loss/idx": 6.0, "loss/logits": 0.1701432168483734, "step": 1154 }, { "epoch": 0.01724665706030357, "grad_norm": 0.49609375, "grad_norm_var": 0.00435789426167806, "learning_rate": 0.0001, "loss": 1.6417, "loss/crossentropy": 2.340711236000061, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.18080533295869827, "step": 1155 }, { "epoch": 0.017261589230918552, "grad_norm": 0.447265625, "grad_norm_var": 0.0042650699615478516, "learning_rate": 0.0001, "loss": 1.6031, "loss/crossentropy": 2.760603189468384, "loss/fcd": 1.4375, "loss/idx": 6.0, "loss/logits": 0.16555795073509216, "step": 1156 }, { "epoch": 0.017276521401533535, "grad_norm": 0.431640625, "grad_norm_var": 0.000579833984375, "learning_rate": 0.0001, "loss": 1.5944, "loss/crossentropy": 2.6986688375473022, "loss/fcd": 1.4296875, "loss/idx": 6.0, "loss/logits": 0.1646973043680191, "step": 1157 }, { "epoch": 0.017291453572148514, "grad_norm": 0.515625, "grad_norm_var": 0.0007277806599934896, "learning_rate": 0.0001, "loss": 1.5676, "loss/crossentropy": 2.542388439178467, "loss/fcd": 1.40625, "loss/idx": 6.0, "loss/logits": 0.1613418385386467, "step": 1158 }, { "epoch": 0.017306385742763497, "grad_norm": 0.458984375, "grad_norm_var": 0.0006687005360921223, "learning_rate": 0.0001, "loss": 1.6341, "loss/crossentropy": 2.4559755325317383, "loss/fcd": 1.45703125, "loss/idx": 6.0, "loss/logits": 0.17710646241903305, "step": 1159 }, { "epoch": 0.01732131791337848, "grad_norm": 0.4375, "grad_norm_var": 0.0006955305735270183, "learning_rate": 0.0001, "loss": 1.5563, "loss/crossentropy": 2.4609339237213135, "loss/fcd": 1.390625, "loss/idx": 6.0, "loss/logits": 0.16569382697343826, "step": 1160 }, { "epoch": 0.01733625008399346, "grad_norm": 0.3359375, "grad_norm_var": 0.0016047000885009766, "learning_rate": 0.0001, "loss": 1.6527, "loss/crossentropy": 2.657406806945801, "loss/fcd": 1.48828125, "loss/idx": 6.25, "loss/logits": 0.16446168720722198, "step": 1161 }, { "epoch": 0.01735118225460844, "grad_norm": 0.314453125, "grad_norm_var": 0.0027704238891601562, "learning_rate": 0.0001, "loss": 1.372, "loss/crossentropy": 2.767332077026367, "loss/fcd": 1.2265625, "loss/idx": 6.5, "loss/logits": 0.14546091854572296, "step": 1162 }, { "epoch": 0.01736611442522342, "grad_norm": 0.3671875, "grad_norm_var": 0.0030930678049723307, "learning_rate": 0.0001, "loss": 1.6071, "loss/crossentropy": 2.622250199317932, "loss/fcd": 1.41796875, "loss/idx": 6.5, "loss/logits": 0.18909041583538055, "step": 1163 }, { "epoch": 0.017381046595838404, "grad_norm": 0.30078125, "grad_norm_var": 0.0042786757151285805, "learning_rate": 0.0001, "loss": 1.4446, "loss/crossentropy": 2.528477191925049, "loss/fcd": 1.28515625, "loss/idx": 6.5, "loss/logits": 0.15940909832715988, "step": 1164 }, { "epoch": 0.017395978766453386, "grad_norm": 0.298828125, "grad_norm_var": 0.005086517333984375, "learning_rate": 0.0001, "loss": 1.459, "loss/crossentropy": 2.553607702255249, "loss/fcd": 1.29296875, "loss/idx": 6.5, "loss/logits": 0.16601165384054184, "step": 1165 }, { "epoch": 0.017410910937068366, "grad_norm": 0.375, "grad_norm_var": 0.00483392079671224, "learning_rate": 0.0001, "loss": 1.6278, "loss/crossentropy": 2.3345470428466797, "loss/fcd": 1.44140625, "loss/idx": 6.5, "loss/logits": 0.1863660141825676, "step": 1166 }, { "epoch": 0.017425843107683348, "grad_norm": 0.326171875, "grad_norm_var": 0.005220778783162435, "learning_rate": 0.0001, "loss": 1.7798, "loss/crossentropy": 2.4525299072265625, "loss/fcd": 1.54296875, "loss/idx": 6.5, "loss/logits": 0.23679041117429733, "step": 1167 }, { "epoch": 0.01744077527829833, "grad_norm": 0.3125, "grad_norm_var": 0.005558204650878906, "learning_rate": 0.0001, "loss": 1.5068, "loss/crossentropy": 2.396833300590515, "loss/fcd": 1.33984375, "loss/idx": 6.5, "loss/logits": 0.16692077368497849, "step": 1168 }, { "epoch": 0.01745570744891331, "grad_norm": 0.341796875, "grad_norm_var": 0.005370950698852539, "learning_rate": 0.0001, "loss": 1.5891, "loss/crossentropy": 2.688977599143982, "loss/fcd": 1.3984375, "loss/idx": 6.5, "loss/logits": 0.1906268373131752, "step": 1169 }, { "epoch": 0.017470639619528293, "grad_norm": 0.33203125, "grad_norm_var": 0.005273675918579102, "learning_rate": 0.0001, "loss": 1.5735, "loss/crossentropy": 2.6352808475494385, "loss/fcd": 1.39453125, "loss/idx": 6.5, "loss/logits": 0.17897118628025055, "step": 1170 }, { "epoch": 0.017485571790143276, "grad_norm": 0.314453125, "grad_norm_var": 0.004541969299316407, "learning_rate": 0.0001, "loss": 1.5492, "loss/crossentropy": 2.646498441696167, "loss/fcd": 1.375, "loss/idx": 6.5, "loss/logits": 0.17418432235717773, "step": 1171 }, { "epoch": 0.017500503960758255, "grad_norm": 0.396484375, "grad_norm_var": 0.004175821940104167, "learning_rate": 0.0001, "loss": 1.7385, "loss/crossentropy": 2.3632874488830566, "loss/fcd": 1.5, "loss/idx": 6.5, "loss/logits": 0.23853551596403122, "step": 1172 }, { "epoch": 0.017515436131373237, "grad_norm": 0.3515625, "grad_norm_var": 0.0038780053456624348, "learning_rate": 0.0001, "loss": 1.6379, "loss/crossentropy": 2.709755778312683, "loss/fcd": 1.44921875, "loss/idx": 6.5, "loss/logits": 0.18869873881340027, "step": 1173 }, { "epoch": 0.01753036830198822, "grad_norm": 0.3203125, "grad_norm_var": 0.0022408644358317058, "learning_rate": 0.0001, "loss": 1.6935, "loss/crossentropy": 2.5511062145233154, "loss/fcd": 1.46875, "loss/idx": 6.5, "loss/logits": 0.2247961387038231, "step": 1174 }, { "epoch": 0.0175453004726032, "grad_norm": 0.470703125, "grad_norm_var": 0.002421299616495768, "learning_rate": 0.0001, "loss": 1.7294, "loss/crossentropy": 2.720292091369629, "loss/fcd": 1.53125, "loss/idx": 6.5, "loss/logits": 0.1981579214334488, "step": 1175 }, { "epoch": 0.017560232643218182, "grad_norm": 0.30859375, "grad_norm_var": 0.0019513289133707681, "learning_rate": 0.0001, "loss": 1.4729, "loss/crossentropy": 2.636868119239807, "loss/fcd": 1.29296875, "loss/idx": 6.5, "loss/logits": 0.17988184094429016, "step": 1176 }, { "epoch": 0.01757516481383316, "grad_norm": 0.3359375, "grad_norm_var": 0.0019513289133707681, "learning_rate": 0.0001, "loss": 1.3891, "loss/crossentropy": 2.7590854167938232, "loss/fcd": 1.23828125, "loss/idx": 6.5, "loss/logits": 0.15085097402334213, "step": 1177 }, { "epoch": 0.017590096984448144, "grad_norm": 0.341796875, "grad_norm_var": 0.001898813247680664, "learning_rate": 0.0001, "loss": 1.6978, "loss/crossentropy": 2.5070645809173584, "loss/fcd": 1.46484375, "loss/idx": 6.5, "loss/logits": 0.2329171895980835, "step": 1178 }, { "epoch": 0.017605029155063127, "grad_norm": 0.396484375, "grad_norm_var": 0.002045440673828125, "learning_rate": 0.0001, "loss": 1.8775, "loss/crossentropy": 2.5748108625411987, "loss/fcd": 1.609375, "loss/idx": 6.5, "loss/logits": 0.26812054216861725, "step": 1179 }, { "epoch": 0.017619961325678106, "grad_norm": 0.291015625, "grad_norm_var": 0.0021092573801676433, "learning_rate": 0.0001, "loss": 1.5288, "loss/crossentropy": 2.4305708408355713, "loss/fcd": 1.34375, "loss/idx": 6.5, "loss/logits": 0.1850065141916275, "step": 1180 }, { "epoch": 0.01763489349629309, "grad_norm": 0.29296875, "grad_norm_var": 0.0021471659342447917, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.5679367780685425, "loss/fcd": 1.26171875, "loss/idx": 6.5, "loss/logits": 0.1688312292098999, "step": 1181 }, { "epoch": 0.01764982566690807, "grad_norm": 0.4375, "grad_norm_var": 0.002647654215494792, "learning_rate": 0.0001, "loss": 1.9263, "loss/crossentropy": 2.317387104034424, "loss/fcd": 1.66796875, "loss/idx": 6.5, "loss/logits": 0.2583017721772194, "step": 1182 }, { "epoch": 0.01766475783752305, "grad_norm": 0.341796875, "grad_norm_var": 0.002617136637369792, "learning_rate": 0.0001, "loss": 1.6345, "loss/crossentropy": 2.413970947265625, "loss/fcd": 1.4375, "loss/idx": 6.5, "loss/logits": 0.1969573274254799, "step": 1183 }, { "epoch": 0.017679690008138033, "grad_norm": 0.36328125, "grad_norm_var": 0.002530352274576823, "learning_rate": 0.0001, "loss": 1.8208, "loss/crossentropy": 2.495847702026367, "loss/fcd": 1.609375, "loss/idx": 6.5, "loss/logits": 0.21144142746925354, "step": 1184 }, { "epoch": 0.017694622178753016, "grad_norm": 0.453125, "grad_norm_var": 0.0031491438547770183, "learning_rate": 0.0001, "loss": 1.8956, "loss/crossentropy": 2.4860759973526, "loss/fcd": 1.625, "loss/idx": 6.5, "loss/logits": 0.27056364715099335, "step": 1185 }, { "epoch": 0.017709554349367995, "grad_norm": 0.294921875, "grad_norm_var": 0.003369903564453125, "learning_rate": 0.0001, "loss": 1.4122, "loss/crossentropy": 2.79799485206604, "loss/fcd": 1.2578125, "loss/idx": 6.5, "loss/logits": 0.15437982231378555, "step": 1186 }, { "epoch": 0.017724486519982978, "grad_norm": 0.33203125, "grad_norm_var": 0.003289651870727539, "learning_rate": 0.0001, "loss": 1.7309, "loss/crossentropy": 2.386631965637207, "loss/fcd": 1.49609375, "loss/idx": 6.5, "loss/logits": 0.2348131462931633, "step": 1187 }, { "epoch": 0.017739418690597957, "grad_norm": 0.30859375, "grad_norm_var": 0.00332183837890625, "learning_rate": 0.0001, "loss": 1.4955, "loss/crossentropy": 2.6373519897460938, "loss/fcd": 1.3125, "loss/idx": 6.5, "loss/logits": 0.1830146610736847, "step": 1188 }, { "epoch": 0.01775435086121294, "grad_norm": 0.296875, "grad_norm_var": 0.0035158793131510415, "learning_rate": 0.0001, "loss": 1.5288, "loss/crossentropy": 2.6739686727523804, "loss/fcd": 1.34375, "loss/idx": 6.5, "loss/logits": 0.1850878894329071, "step": 1189 }, { "epoch": 0.017769283031827923, "grad_norm": 0.283203125, "grad_norm_var": 0.0037444909413655598, "learning_rate": 0.0001, "loss": 1.4293, "loss/crossentropy": 2.4060449600219727, "loss/fcd": 1.26953125, "loss/idx": 6.5, "loss/logits": 0.15978636592626572, "step": 1190 }, { "epoch": 0.017784215202442902, "grad_norm": 0.2890625, "grad_norm_var": 0.0028058369954427082, "learning_rate": 0.0001, "loss": 1.4926, "loss/crossentropy": 2.5885671377182007, "loss/fcd": 1.31640625, "loss/idx": 6.5, "loss/logits": 0.17616400122642517, "step": 1191 }, { "epoch": 0.017799147373057884, "grad_norm": 0.279296875, "grad_norm_var": 0.0029643853505452473, "learning_rate": 0.0001, "loss": 1.4515, "loss/crossentropy": 2.5950748920440674, "loss/fcd": 1.28515625, "loss/idx": 6.5, "loss/logits": 0.16635886579751968, "step": 1192 }, { "epoch": 0.017814079543672867, "grad_norm": 0.337890625, "grad_norm_var": 0.002965227762858073, "learning_rate": 0.0001, "loss": 1.6403, "loss/crossentropy": 2.4442238807678223, "loss/fcd": 1.44140625, "loss/idx": 6.5, "loss/logits": 0.19890712201595306, "step": 1193 }, { "epoch": 0.017829011714287846, "grad_norm": 0.796875, "grad_norm_var": 0.01639758745829264, "learning_rate": 0.0001, "loss": 1.904, "loss/crossentropy": 2.715569853782654, "loss/fcd": 1.49609375, "loss/idx": 6.5, "loss/logits": 0.4079201966524124, "step": 1194 }, { "epoch": 0.01784394388490283, "grad_norm": 0.267578125, "grad_norm_var": 0.016846577326456707, "learning_rate": 0.0001, "loss": 1.3684, "loss/crossentropy": 2.5312520265579224, "loss/fcd": 1.21875, "loss/idx": 6.5, "loss/logits": 0.14967987686395645, "step": 1195 }, { "epoch": 0.017858876055517812, "grad_norm": 0.302734375, "grad_norm_var": 0.016756550470987955, "learning_rate": 0.0001, "loss": 1.5207, "loss/crossentropy": 2.6301435232162476, "loss/fcd": 1.33984375, "loss/idx": 6.5, "loss/logits": 0.18084491789340973, "step": 1196 }, { "epoch": 0.01787380822613279, "grad_norm": 0.2734375, "grad_norm_var": 0.016941563288370768, "learning_rate": 0.0001, "loss": 1.4427, "loss/crossentropy": 2.401396632194519, "loss/fcd": 1.28515625, "loss/idx": 6.5, "loss/logits": 0.1575082242488861, "step": 1197 }, { "epoch": 0.017888740396747774, "grad_norm": 0.345703125, "grad_norm_var": 0.016441790262858073, "learning_rate": 0.0001, "loss": 1.5302, "loss/crossentropy": 2.4098260402679443, "loss/fcd": 1.35546875, "loss/idx": 6.5, "loss/logits": 0.17468641698360443, "step": 1198 }, { "epoch": 0.017903672567362756, "grad_norm": 0.296875, "grad_norm_var": 0.01660447120666504, "learning_rate": 0.0001, "loss": 1.5407, "loss/crossentropy": 2.5780783891677856, "loss/fcd": 1.359375, "loss/idx": 6.5, "loss/logits": 0.18136019259691238, "step": 1199 }, { "epoch": 0.017918604737977736, "grad_norm": 0.30078125, "grad_norm_var": 0.016697041193644204, "learning_rate": 0.0001, "loss": 1.4706, "loss/crossentropy": 2.601935863494873, "loss/fcd": 1.3046875, "loss/idx": 6.5, "loss/logits": 0.1659601628780365, "step": 1200 }, { "epoch": 0.01793353690859272, "grad_norm": 0.32421875, "grad_norm_var": 0.015811649958292644, "learning_rate": 0.0001, "loss": 1.5053, "loss/crossentropy": 2.5727096796035767, "loss/fcd": 1.32421875, "loss/idx": 6.5, "loss/logits": 0.18112681806087494, "step": 1201 }, { "epoch": 0.017948469079207698, "grad_norm": 0.341796875, "grad_norm_var": 0.015710179011027017, "learning_rate": 0.0001, "loss": 1.5072, "loss/crossentropy": 2.514320135116577, "loss/fcd": 1.33203125, "loss/idx": 6.5, "loss/logits": 0.17511937022209167, "step": 1202 }, { "epoch": 0.01796340124982268, "grad_norm": 0.373046875, "grad_norm_var": 0.015793291727701823, "learning_rate": 0.0001, "loss": 1.6384, "loss/crossentropy": 2.3392175436019897, "loss/fcd": 1.44921875, "loss/idx": 6.5, "loss/logits": 0.1891556680202484, "step": 1203 }, { "epoch": 0.017978333420437663, "grad_norm": 0.33203125, "grad_norm_var": 0.015733782450358072, "learning_rate": 0.0001, "loss": 1.6298, "loss/crossentropy": 2.6545450687408447, "loss/fcd": 1.43359375, "loss/idx": 6.5, "loss/logits": 0.1961742267012596, "step": 1204 }, { "epoch": 0.017993265591052642, "grad_norm": 0.353515625, "grad_norm_var": 0.015607945124308268, "learning_rate": 0.0001, "loss": 1.5463, "loss/crossentropy": 2.391342043876648, "loss/fcd": 1.37109375, "loss/idx": 6.5, "loss/logits": 0.17522381246089935, "step": 1205 }, { "epoch": 0.018008197761667625, "grad_norm": 0.5, "grad_norm_var": 0.01679884592692057, "learning_rate": 0.0001, "loss": 1.4859, "loss/crossentropy": 2.374518036842346, "loss/fcd": 1.3046875, "loss/idx": 6.5, "loss/logits": 0.18116553127765656, "step": 1206 }, { "epoch": 0.018023129932282608, "grad_norm": 0.30859375, "grad_norm_var": 0.016645304361979165, "learning_rate": 0.0001, "loss": 1.6511, "loss/crossentropy": 2.563612222671509, "loss/fcd": 1.44140625, "loss/idx": 6.5, "loss/logits": 0.2097008004784584, "step": 1207 }, { "epoch": 0.018038062102897587, "grad_norm": 0.36328125, "grad_norm_var": 0.01620036760965983, "learning_rate": 0.0001, "loss": 1.7428, "loss/crossentropy": 2.3562076091766357, "loss/fcd": 1.515625, "loss/idx": 6.5, "loss/logits": 0.2272082269191742, "step": 1208 }, { "epoch": 0.01805299427351257, "grad_norm": 0.357421875, "grad_norm_var": 0.016157134373982748, "learning_rate": 0.0001, "loss": 1.6028, "loss/crossentropy": 2.3315422534942627, "loss/fcd": 1.4140625, "loss/idx": 6.5, "loss/logits": 0.188755564391613, "step": 1209 }, { "epoch": 0.018067926444127552, "grad_norm": 0.296875, "grad_norm_var": 0.002981678644816081, "learning_rate": 0.0001, "loss": 1.5347, "loss/crossentropy": 2.6092514991760254, "loss/fcd": 1.35546875, "loss/idx": 6.5, "loss/logits": 0.17925221472978592, "step": 1210 }, { "epoch": 0.01808285861474253, "grad_norm": 0.30859375, "grad_norm_var": 0.002725664774576823, "learning_rate": 0.0001, "loss": 1.5431, "loss/crossentropy": 2.524160623550415, "loss/fcd": 1.3671875, "loss/idx": 6.5, "loss/logits": 0.17589685320854187, "step": 1211 }, { "epoch": 0.018097790785357514, "grad_norm": 0.80859375, "grad_norm_var": 0.016463073094685872, "learning_rate": 0.0001, "loss": 1.7323, "loss/crossentropy": 2.454976439476013, "loss/fcd": 1.49609375, "loss/idx": 6.5, "loss/logits": 0.2362159788608551, "step": 1212 }, { "epoch": 0.018112722955972493, "grad_norm": 0.30859375, "grad_norm_var": 0.01609800656636556, "learning_rate": 0.0001, "loss": 1.4605, "loss/crossentropy": 2.6214447021484375, "loss/fcd": 1.296875, "loss/idx": 6.5, "loss/logits": 0.163585864007473, "step": 1213 }, { "epoch": 0.018127655126587476, "grad_norm": 0.384765625, "grad_norm_var": 0.016066853205362955, "learning_rate": 0.0001, "loss": 1.6508, "loss/crossentropy": 2.786266326904297, "loss/fcd": 1.4296875, "loss/idx": 6.5, "loss/logits": 0.22115909308195114, "step": 1214 }, { "epoch": 0.01814258729720246, "grad_norm": 0.3203125, "grad_norm_var": 0.015865055720011394, "learning_rate": 0.0001, "loss": 1.6344, "loss/crossentropy": 2.6109365224838257, "loss/fcd": 1.41796875, "loss/idx": 6.5, "loss/logits": 0.21646693348884583, "step": 1215 }, { "epoch": 0.018157519467817438, "grad_norm": 0.333984375, "grad_norm_var": 0.015610249837239583, "learning_rate": 0.0001, "loss": 1.5222, "loss/crossentropy": 2.4061062335968018, "loss/fcd": 1.3515625, "loss/idx": 6.5, "loss/logits": 0.1706232950091362, "step": 1216 }, { "epoch": 0.01817245163843242, "grad_norm": 0.279296875, "grad_norm_var": 0.0160463809967041, "learning_rate": 0.0001, "loss": 1.4615, "loss/crossentropy": 2.537352442741394, "loss/fcd": 1.2890625, "loss/idx": 6.5, "loss/logits": 0.17248404771089554, "step": 1217 }, { "epoch": 0.018187383809047403, "grad_norm": 0.376953125, "grad_norm_var": 0.015976572036743165, "learning_rate": 0.0001, "loss": 1.6346, "loss/crossentropy": 2.5680431127548218, "loss/fcd": 1.4296875, "loss/idx": 6.5, "loss/logits": 0.20487521588802338, "step": 1218 }, { "epoch": 0.018202315979662383, "grad_norm": 0.287109375, "grad_norm_var": 0.016464726130167643, "learning_rate": 0.0001, "loss": 1.4726, "loss/crossentropy": 2.602305054664612, "loss/fcd": 1.3046875, "loss/idx": 6.5, "loss/logits": 0.16788026690483093, "step": 1219 }, { "epoch": 0.018217248150277365, "grad_norm": 0.306640625, "grad_norm_var": 0.01663354237874349, "learning_rate": 0.0001, "loss": 1.6387, "loss/crossentropy": 2.514701008796692, "loss/fcd": 1.4296875, "loss/idx": 6.5, "loss/logits": 0.2090064287185669, "step": 1220 }, { "epoch": 0.018232180320892348, "grad_norm": 0.345703125, "grad_norm_var": 0.016652870178222656, "learning_rate": 0.0001, "loss": 1.7284, "loss/crossentropy": 2.7042866945266724, "loss/fcd": 1.5, "loss/idx": 6.5, "loss/logits": 0.22837670892477036, "step": 1221 }, { "epoch": 0.018247112491507327, "grad_norm": 0.34765625, "grad_norm_var": 0.0154205322265625, "learning_rate": 0.0001, "loss": 1.6607, "loss/crossentropy": 2.6267004013061523, "loss/fcd": 1.44140625, "loss/idx": 6.5, "loss/logits": 0.21924810856580734, "step": 1222 }, { "epoch": 0.01826204466212231, "grad_norm": 0.31640625, "grad_norm_var": 0.015372467041015626, "learning_rate": 0.0001, "loss": 1.4897, "loss/crossentropy": 2.599629044532776, "loss/fcd": 1.31640625, "loss/idx": 6.5, "loss/logits": 0.17330200970172882, "step": 1223 }, { "epoch": 0.01827697683273729, "grad_norm": 0.369140625, "grad_norm_var": 0.015378046035766601, "learning_rate": 0.0001, "loss": 1.6224, "loss/crossentropy": 2.4942342042922974, "loss/fcd": 1.43359375, "loss/idx": 6.5, "loss/logits": 0.18882182240486145, "step": 1224 }, { "epoch": 0.018291909003352272, "grad_norm": 0.326171875, "grad_norm_var": 0.015446710586547851, "learning_rate": 0.0001, "loss": 1.5395, "loss/crossentropy": 2.627850890159607, "loss/fcd": 1.36328125, "loss/idx": 6.5, "loss/logits": 0.17617731541395187, "step": 1225 }, { "epoch": 0.018306841173967255, "grad_norm": 0.3359375, "grad_norm_var": 0.015227365493774413, "learning_rate": 0.0001, "loss": 1.6615, "loss/crossentropy": 2.448357105255127, "loss/fcd": 1.44921875, "loss/idx": 6.5, "loss/logits": 0.2123221904039383, "step": 1226 }, { "epoch": 0.018321773344582234, "grad_norm": 0.333984375, "grad_norm_var": 0.015094502766927084, "learning_rate": 0.0001, "loss": 1.6281, "loss/crossentropy": 2.7975505590438843, "loss/fcd": 1.42578125, "loss/idx": 6.5, "loss/logits": 0.2022945061326027, "step": 1227 }, { "epoch": 0.018336705515197217, "grad_norm": 0.345703125, "grad_norm_var": 0.0008815606435139974, "learning_rate": 0.0001, "loss": 1.594, "loss/crossentropy": 2.6412123441696167, "loss/fcd": 1.3828125, "loss/idx": 6.5, "loss/logits": 0.21123310923576355, "step": 1228 }, { "epoch": 0.0183516376858122, "grad_norm": 0.3046875, "grad_norm_var": 0.0008949120839436849, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.7561702728271484, "loss/fcd": 1.3515625, "loss/idx": 6.5, "loss/logits": 0.1773766726255417, "step": 1229 }, { "epoch": 0.01836656985642718, "grad_norm": 0.328125, "grad_norm_var": 0.000698089599609375, "learning_rate": 0.0001, "loss": 1.6053, "loss/crossentropy": 2.3611074686050415, "loss/fcd": 1.41015625, "loss/idx": 6.5, "loss/logits": 0.19514558464288712, "step": 1230 }, { "epoch": 0.01838150202704216, "grad_norm": 0.306640625, "grad_norm_var": 0.0007249037424723307, "learning_rate": 0.0001, "loss": 1.6022, "loss/crossentropy": 2.4678847789764404, "loss/fcd": 1.421875, "loss/idx": 6.5, "loss/logits": 0.18031777441501617, "step": 1231 }, { "epoch": 0.018396434197657144, "grad_norm": 0.3203125, "grad_norm_var": 0.0007252375284830729, "learning_rate": 0.0001, "loss": 1.4916, "loss/crossentropy": 2.404633402824402, "loss/fcd": 1.328125, "loss/idx": 6.5, "loss/logits": 0.16343700140714645, "step": 1232 }, { "epoch": 0.018411366368272123, "grad_norm": 0.349609375, "grad_norm_var": 0.0005879084269205729, "learning_rate": 0.0001, "loss": 1.5751, "loss/crossentropy": 2.3156590461730957, "loss/fcd": 1.38671875, "loss/idx": 6.5, "loss/logits": 0.18834958225488663, "step": 1233 }, { "epoch": 0.018426298538887106, "grad_norm": 0.322265625, "grad_norm_var": 0.00044193267822265623, "learning_rate": 0.0001, "loss": 1.5267, "loss/crossentropy": 2.755373477935791, "loss/fcd": 1.34765625, "loss/idx": 6.5, "loss/logits": 0.17905279248952866, "step": 1234 }, { "epoch": 0.01844123070950209, "grad_norm": 0.302734375, "grad_norm_var": 0.00037225087483723957, "learning_rate": 0.0001, "loss": 1.6031, "loss/crossentropy": 2.919349431991577, "loss/fcd": 1.390625, "loss/idx": 6.5, "loss/logits": 0.21244197338819504, "step": 1235 }, { "epoch": 0.018456162880117068, "grad_norm": 0.376953125, "grad_norm_var": 0.0004729588826497396, "learning_rate": 0.0001, "loss": 1.6075, "loss/crossentropy": 2.5358701944351196, "loss/fcd": 1.4140625, "loss/idx": 6.5, "loss/logits": 0.1933974176645279, "step": 1236 }, { "epoch": 0.01847109505073205, "grad_norm": 0.328125, "grad_norm_var": 0.0004630883534749349, "learning_rate": 0.0001, "loss": 1.7203, "loss/crossentropy": 2.363653302192688, "loss/fcd": 1.5, "loss/idx": 6.5, "loss/logits": 0.22028225660324097, "step": 1237 }, { "epoch": 0.01848602722134703, "grad_norm": 0.326171875, "grad_norm_var": 0.0004475275675455729, "learning_rate": 0.0001, "loss": 1.4636, "loss/crossentropy": 2.452838659286499, "loss/fcd": 1.29296875, "loss/idx": 6.5, "loss/logits": 0.17060783505439758, "step": 1238 }, { "epoch": 0.018500959391962012, "grad_norm": 0.298828125, "grad_norm_var": 0.0005005995432535807, "learning_rate": 0.0001, "loss": 1.5562, "loss/crossentropy": 2.486661434173584, "loss/fcd": 1.375, "loss/idx": 6.5, "loss/logits": 0.18124858289957047, "step": 1239 }, { "epoch": 0.018515891562576995, "grad_norm": 0.298828125, "grad_norm_var": 0.0004399458567301432, "learning_rate": 0.0001, "loss": 1.4547, "loss/crossentropy": 2.6643694639205933, "loss/fcd": 1.28125, "loss/idx": 6.5, "loss/logits": 0.17340320348739624, "step": 1240 }, { "epoch": 0.018530823733191974, "grad_norm": 0.27734375, "grad_norm_var": 0.0005833943684895833, "learning_rate": 0.0001, "loss": 1.4081, "loss/crossentropy": 2.4870904684066772, "loss/fcd": 1.2578125, "loss/idx": 6.5, "loss/logits": 0.1503349393606186, "step": 1241 }, { "epoch": 0.018545755903806957, "grad_norm": 0.326171875, "grad_norm_var": 0.0005715529123942058, "learning_rate": 0.0001, "loss": 1.5116, "loss/crossentropy": 2.79026997089386, "loss/fcd": 1.33203125, "loss/idx": 6.5, "loss/logits": 0.1795227974653244, "step": 1242 }, { "epoch": 0.01856068807442194, "grad_norm": 0.3515625, "grad_norm_var": 0.0006197611490885417, "learning_rate": 0.0001, "loss": 1.6242, "loss/crossentropy": 2.626877784729004, "loss/fcd": 1.4296875, "loss/idx": 6.5, "loss/logits": 0.1945071667432785, "step": 1243 }, { "epoch": 0.01857562024503692, "grad_norm": 0.34765625, "grad_norm_var": 0.0006259759267171224, "learning_rate": 0.0001, "loss": 1.8053, "loss/crossentropy": 2.413367986679077, "loss/fcd": 1.546875, "loss/idx": 6.5, "loss/logits": 0.2584308609366417, "step": 1244 }, { "epoch": 0.0185905524156519, "grad_norm": 0.30078125, "grad_norm_var": 0.0006364027659098308, "learning_rate": 0.0001, "loss": 1.6274, "loss/crossentropy": 2.557702898979187, "loss/fcd": 1.4296875, "loss/idx": 6.5, "loss/logits": 0.19771048426628113, "step": 1245 }, { "epoch": 0.018605484586266884, "grad_norm": 0.3125, "grad_norm_var": 0.0006402174631754558, "learning_rate": 0.0001, "loss": 1.7298, "loss/crossentropy": 2.6046078205108643, "loss/fcd": 1.5078125, "loss/idx": 6.5, "loss/logits": 0.2219381481409073, "step": 1246 }, { "epoch": 0.018620416756881863, "grad_norm": 0.431640625, "grad_norm_var": 0.0013665358225504558, "learning_rate": 0.0001, "loss": 1.915, "loss/crossentropy": 2.4264408349990845, "loss/fcd": 1.66796875, "loss/idx": 6.5, "loss/logits": 0.24700388312339783, "step": 1247 }, { "epoch": 0.018635348927496846, "grad_norm": 0.359375, "grad_norm_var": 0.0014142195383707683, "learning_rate": 0.0001, "loss": 1.6276, "loss/crossentropy": 2.7338656187057495, "loss/fcd": 1.4375, "loss/idx": 6.5, "loss/logits": 0.19006990641355515, "step": 1248 }, { "epoch": 0.018650281098111825, "grad_norm": 0.296875, "grad_norm_var": 0.0014635721842447917, "learning_rate": 0.0001, "loss": 1.4963, "loss/crossentropy": 2.4242324829101562, "loss/fcd": 1.31640625, "loss/idx": 6.5, "loss/logits": 0.1799371838569641, "step": 1249 }, { "epoch": 0.018665213268726808, "grad_norm": 0.3828125, "grad_norm_var": 0.0016414483388264975, "learning_rate": 0.0001, "loss": 1.6714, "loss/crossentropy": 2.492332339286804, "loss/fcd": 1.45703125, "loss/idx": 6.5, "loss/logits": 0.2144075110554695, "step": 1250 }, { "epoch": 0.01868014543934179, "grad_norm": 0.306640625, "grad_norm_var": 0.0016269524892171224, "learning_rate": 0.0001, "loss": 1.4011, "loss/crossentropy": 2.542190194129944, "loss/fcd": 1.25, "loss/idx": 6.5, "loss/logits": 0.15106689184904099, "step": 1251 }, { "epoch": 0.01869507760995677, "grad_norm": 0.326171875, "grad_norm_var": 0.0014880975087483725, "learning_rate": 0.0001, "loss": 1.5506, "loss/crossentropy": 2.5000863075256348, "loss/fcd": 1.37109375, "loss/idx": 6.5, "loss/logits": 0.17947331815958023, "step": 1252 }, { "epoch": 0.018710009780571753, "grad_norm": 0.3203125, "grad_norm_var": 0.0014933109283447265, "learning_rate": 0.0001, "loss": 1.4848, "loss/crossentropy": 2.552633047103882, "loss/fcd": 1.30078125, "loss/idx": 6.5, "loss/logits": 0.18402951210737228, "step": 1253 }, { "epoch": 0.018724941951186735, "grad_norm": 0.7578125, "grad_norm_var": 0.012976328531901041, "learning_rate": 0.0001, "loss": 1.7571, "loss/crossentropy": 2.593714118003845, "loss/fcd": 1.5078125, "loss/idx": 6.5, "loss/logits": 0.24928182363510132, "step": 1254 }, { "epoch": 0.018739874121801715, "grad_norm": 0.283203125, "grad_norm_var": 0.013110605875651042, "learning_rate": 0.0001, "loss": 1.5159, "loss/crossentropy": 2.672044038772583, "loss/fcd": 1.33203125, "loss/idx": 6.5, "loss/logits": 0.1838703230023384, "step": 1255 }, { "epoch": 0.018754806292416697, "grad_norm": 0.3359375, "grad_norm_var": 0.012918837865193685, "learning_rate": 0.0001, "loss": 1.658, "loss/crossentropy": 2.4547826051712036, "loss/fcd": 1.4453125, "loss/idx": 6.5, "loss/logits": 0.21269508451223373, "step": 1256 }, { "epoch": 0.01876973846303168, "grad_norm": 0.310546875, "grad_norm_var": 0.012633768717447917, "learning_rate": 0.0001, "loss": 1.5041, "loss/crossentropy": 2.6580370664596558, "loss/fcd": 1.3359375, "loss/idx": 6.5, "loss/logits": 0.1681847870349884, "step": 1257 }, { "epoch": 0.01878467063364666, "grad_norm": 0.333984375, "grad_norm_var": 0.012602996826171876, "learning_rate": 0.0001, "loss": 1.5575, "loss/crossentropy": 2.8848483562469482, "loss/fcd": 1.3671875, "loss/idx": 6.5, "loss/logits": 0.19028093665838242, "step": 1258 }, { "epoch": 0.018799602804261642, "grad_norm": 0.3828125, "grad_norm_var": 0.012629445393880208, "learning_rate": 0.0001, "loss": 1.678, "loss/crossentropy": 3.0120307207107544, "loss/fcd": 1.4765625, "loss/idx": 6.5, "loss/logits": 0.20148076117038727, "step": 1259 }, { "epoch": 0.01881453497487662, "grad_norm": 0.298828125, "grad_norm_var": 0.01287064552307129, "learning_rate": 0.0001, "loss": 1.477, "loss/crossentropy": 2.5219684839248657, "loss/fcd": 1.3203125, "loss/idx": 6.5, "loss/logits": 0.15666230767965317, "step": 1260 }, { "epoch": 0.018829467145491604, "grad_norm": 0.408203125, "grad_norm_var": 0.012761370340983073, "learning_rate": 0.0001, "loss": 1.6376, "loss/crossentropy": 2.52824604511261, "loss/fcd": 1.44921875, "loss/idx": 6.5, "loss/logits": 0.1883331537246704, "step": 1261 }, { "epoch": 0.018844399316106587, "grad_norm": 0.326171875, "grad_norm_var": 0.012676477432250977, "learning_rate": 0.0001, "loss": 1.5013, "loss/crossentropy": 2.635972738265991, "loss/fcd": 1.3203125, "loss/idx": 6.5, "loss/logits": 0.1809828281402588, "step": 1262 }, { "epoch": 0.018859331486721566, "grad_norm": 0.359375, "grad_norm_var": 0.012373606363932291, "learning_rate": 0.0001, "loss": 1.5878, "loss/crossentropy": 2.927635669708252, "loss/fcd": 1.38671875, "loss/idx": 6.5, "loss/logits": 0.20111427456140518, "step": 1263 }, { "epoch": 0.01887426365733655, "grad_norm": 0.3671875, "grad_norm_var": 0.0123748779296875, "learning_rate": 0.0001, "loss": 1.5985, "loss/crossentropy": 2.4248982667922974, "loss/fcd": 1.41015625, "loss/idx": 6.5, "loss/logits": 0.18835779279470444, "step": 1264 }, { "epoch": 0.01888919582795153, "grad_norm": 0.333984375, "grad_norm_var": 0.012137206395467122, "learning_rate": 0.0001, "loss": 1.4965, "loss/crossentropy": 2.5499627590179443, "loss/fcd": 1.328125, "loss/idx": 6.5, "loss/logits": 0.16840286552906036, "step": 1265 }, { "epoch": 0.01890412799856651, "grad_norm": 0.33984375, "grad_norm_var": 0.012148396174112955, "learning_rate": 0.0001, "loss": 1.6923, "loss/crossentropy": 2.817864418029785, "loss/fcd": 1.48046875, "loss/idx": 6.5, "loss/logits": 0.21178434789180756, "step": 1266 }, { "epoch": 0.018919060169181493, "grad_norm": 0.337890625, "grad_norm_var": 0.011979023615519205, "learning_rate": 0.0001, "loss": 1.5916, "loss/crossentropy": 2.808348774909973, "loss/fcd": 1.40234375, "loss/idx": 6.5, "loss/logits": 0.189209446310997, "step": 1267 }, { "epoch": 0.018933992339796476, "grad_norm": 0.27734375, "grad_norm_var": 0.012373606363932291, "learning_rate": 0.0001, "loss": 1.3998, "loss/crossentropy": 2.685523271560669, "loss/fcd": 1.23828125, "loss/idx": 6.5, "loss/logits": 0.16152724623680115, "step": 1268 }, { "epoch": 0.018948924510411455, "grad_norm": 0.28125, "grad_norm_var": 0.0126800537109375, "learning_rate": 0.0001, "loss": 1.4709, "loss/crossentropy": 2.490466594696045, "loss/fcd": 1.296875, "loss/idx": 6.5, "loss/logits": 0.17405615001916885, "step": 1269 }, { "epoch": 0.018963856681026438, "grad_norm": 0.31640625, "grad_norm_var": 0.0013503392537434896, "learning_rate": 0.0001, "loss": 1.5713, "loss/crossentropy": 2.6393624544143677, "loss/fcd": 1.3671875, "loss/idx": 6.5, "loss/logits": 0.20406648516654968, "step": 1270 }, { "epoch": 0.01897878885164142, "grad_norm": 0.30078125, "grad_norm_var": 0.001258071263631185, "learning_rate": 0.0001, "loss": 1.4812, "loss/crossentropy": 2.5566498041152954, "loss/fcd": 1.30859375, "loss/idx": 6.5, "loss/logits": 0.172604039311409, "step": 1271 }, { "epoch": 0.0189937210222564, "grad_norm": 0.369140625, "grad_norm_var": 0.0013448079427083334, "learning_rate": 0.0001, "loss": 1.6218, "loss/crossentropy": 2.7665399312973022, "loss/fcd": 1.421875, "loss/idx": 6.5, "loss/logits": 0.1998923420906067, "step": 1272 }, { "epoch": 0.019008653192871382, "grad_norm": 0.359375, "grad_norm_var": 0.00134123166402181, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.761102795600891, "loss/fcd": 1.3046875, "loss/idx": 6.5, "loss/logits": 0.1654694825410843, "step": 1273 }, { "epoch": 0.01902358536348636, "grad_norm": 0.404296875, "grad_norm_var": 0.0016216119130452475, "learning_rate": 0.0001, "loss": 1.776, "loss/crossentropy": 2.5646800994873047, "loss/fcd": 1.55078125, "loss/idx": 6.5, "loss/logits": 0.22524471580982208, "step": 1274 }, { "epoch": 0.019038517534101344, "grad_norm": 0.2890625, "grad_norm_var": 0.0016536553700764974, "learning_rate": 0.0001, "loss": 1.5461, "loss/crossentropy": 2.4658591747283936, "loss/fcd": 1.35546875, "loss/idx": 6.5, "loss/logits": 0.19066184759140015, "step": 1275 }, { "epoch": 0.019053449704716327, "grad_norm": 0.333984375, "grad_norm_var": 0.001558669408162435, "learning_rate": 0.0001, "loss": 1.6102, "loss/crossentropy": 2.6678355932235718, "loss/fcd": 1.4140625, "loss/idx": 6.5, "loss/logits": 0.19614477455615997, "step": 1276 }, { "epoch": 0.019068381875331306, "grad_norm": 0.30859375, "grad_norm_var": 0.0012433369954427083, "learning_rate": 0.0001, "loss": 1.6021, "loss/crossentropy": 2.784413456916809, "loss/fcd": 1.3984375, "loss/idx": 6.5, "loss/logits": 0.2036561220884323, "step": 1277 }, { "epoch": 0.01908331404594629, "grad_norm": 0.341796875, "grad_norm_var": 0.001247406005859375, "learning_rate": 0.0001, "loss": 1.6353, "loss/crossentropy": 2.580705761909485, "loss/fcd": 1.4453125, "loss/idx": 6.5, "loss/logits": 0.18997424840927124, "step": 1278 }, { "epoch": 0.01909824621656127, "grad_norm": 0.32421875, "grad_norm_var": 0.0011987686157226562, "learning_rate": 0.0001, "loss": 1.3843, "loss/crossentropy": 2.869256019592285, "loss/fcd": 1.234375, "loss/idx": 6.5, "loss/logits": 0.14993099868297577, "step": 1279 }, { "epoch": 0.01911317838717625, "grad_norm": 0.3515625, "grad_norm_var": 0.001137224833170573, "learning_rate": 0.0001, "loss": 1.6721, "loss/crossentropy": 2.6417490243911743, "loss/fcd": 1.46875, "loss/idx": 6.5, "loss/logits": 0.20330028980970383, "step": 1280 }, { "epoch": 0.019128110557791234, "grad_norm": 0.33984375, "grad_norm_var": 0.0011429945627848307, "learning_rate": 0.0001, "loss": 1.7471, "loss/crossentropy": 2.4359676837921143, "loss/fcd": 1.52734375, "loss/idx": 6.5, "loss/logits": 0.2198041006922722, "step": 1281 }, { "epoch": 0.019143042728406216, "grad_norm": 0.310546875, "grad_norm_var": 0.001157061258951823, "learning_rate": 0.0001, "loss": 1.4947, "loss/crossentropy": 2.415672779083252, "loss/fcd": 1.33203125, "loss/idx": 6.5, "loss/logits": 0.1626281514763832, "step": 1282 }, { "epoch": 0.019157974899021196, "grad_norm": 0.322265625, "grad_norm_var": 0.0011514663696289063, "learning_rate": 0.0001, "loss": 1.607, "loss/crossentropy": 2.599458336830139, "loss/fcd": 1.40234375, "loss/idx": 6.5, "loss/logits": 0.20462485402822495, "step": 1283 }, { "epoch": 0.019172907069636178, "grad_norm": 0.33984375, "grad_norm_var": 0.0009826024373372395, "learning_rate": 0.0001, "loss": 1.5397, "loss/crossentropy": 2.6432963609695435, "loss/fcd": 1.36328125, "loss/idx": 6.5, "loss/logits": 0.1764075607061386, "step": 1284 }, { "epoch": 0.019187839240251157, "grad_norm": 0.310546875, "grad_norm_var": 0.0008426507314046224, "learning_rate": 0.0001, "loss": 1.5456, "loss/crossentropy": 2.562265396118164, "loss/fcd": 1.35546875, "loss/idx": 6.5, "loss/logits": 0.1901203915476799, "step": 1285 }, { "epoch": 0.01920277141086614, "grad_norm": 0.32421875, "grad_norm_var": 0.0008295536041259766, "learning_rate": 0.0001, "loss": 1.6828, "loss/crossentropy": 2.4806153774261475, "loss/fcd": 1.4765625, "loss/idx": 6.5, "loss/logits": 0.20623362064361572, "step": 1286 }, { "epoch": 0.019217703581481123, "grad_norm": 0.30078125, "grad_norm_var": 0.0008295536041259766, "learning_rate": 0.0001, "loss": 1.4943, "loss/crossentropy": 2.600746989250183, "loss/fcd": 1.31640625, "loss/idx": 6.5, "loss/logits": 0.1778869926929474, "step": 1287 }, { "epoch": 0.019232635752096102, "grad_norm": 0.306640625, "grad_norm_var": 0.0007736047108968098, "learning_rate": 0.0001, "loss": 1.6329, "loss/crossentropy": 2.6411110162734985, "loss/fcd": 1.421875, "loss/idx": 6.5, "loss/logits": 0.21099568903446198, "step": 1288 }, { "epoch": 0.019247567922711085, "grad_norm": 0.259765625, "grad_norm_var": 0.0009932835896809896, "learning_rate": 0.0001, "loss": 1.4852, "loss/crossentropy": 2.5829832553863525, "loss/fcd": 1.30859375, "loss/idx": 6.5, "loss/logits": 0.17659380286931992, "step": 1289 }, { "epoch": 0.019262500093326067, "grad_norm": 0.314453125, "grad_norm_var": 0.0005238850911458334, "learning_rate": 0.0001, "loss": 1.5055, "loss/crossentropy": 2.631233811378479, "loss/fcd": 1.3203125, "loss/idx": 6.5, "loss/logits": 0.185147225856781, "step": 1290 }, { "epoch": 0.019277432263941047, "grad_norm": 0.2890625, "grad_norm_var": 0.0005238850911458334, "learning_rate": 0.0001, "loss": 1.4458, "loss/crossentropy": 2.445384979248047, "loss/fcd": 1.2890625, "loss/idx": 6.5, "loss/logits": 0.15676811337471008, "step": 1291 }, { "epoch": 0.01929236443455603, "grad_norm": 0.341796875, "grad_norm_var": 0.0005449930826822917, "learning_rate": 0.0001, "loss": 1.6089, "loss/crossentropy": 2.795995831489563, "loss/fcd": 1.41796875, "loss/idx": 6.5, "loss/logits": 0.190910205245018, "step": 1292 }, { "epoch": 0.019307296605171012, "grad_norm": 0.3203125, "grad_norm_var": 0.0005390803019205729, "learning_rate": 0.0001, "loss": 1.7172, "loss/crossentropy": 2.6674411296844482, "loss/fcd": 1.48828125, "loss/idx": 6.5, "loss/logits": 0.22890527546405792, "step": 1293 }, { "epoch": 0.01932222877578599, "grad_norm": 0.310546875, "grad_norm_var": 0.0005034764607747395, "learning_rate": 0.0001, "loss": 1.513, "loss/crossentropy": 2.369943857192993, "loss/fcd": 1.33203125, "loss/idx": 6.5, "loss/logits": 0.18096201121807098, "step": 1294 }, { "epoch": 0.019337160946400974, "grad_norm": 0.3203125, "grad_norm_var": 0.00050048828125, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.60583758354187, "loss/fcd": 1.30078125, "loss/idx": 6.5, "loss/logits": 0.1694255843758583, "step": 1295 }, { "epoch": 0.019352093117015957, "grad_norm": 0.361328125, "grad_norm_var": 0.000552225112915039, "learning_rate": 0.0001, "loss": 1.4583, "loss/crossentropy": 2.734460473060608, "loss/fcd": 1.29296875, "loss/idx": 6.5, "loss/logits": 0.16532278805971146, "step": 1296 }, { "epoch": 0.019367025287630936, "grad_norm": 0.60546875, "grad_norm_var": 0.005770476659138998, "learning_rate": 0.0001, "loss": 1.6362, "loss/crossentropy": 2.6057682037353516, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.1831134930253029, "step": 1297 }, { "epoch": 0.01938195745824592, "grad_norm": 0.86328125, "grad_norm_var": 0.023164876302083335, "learning_rate": 0.0001, "loss": 1.7227, "loss/crossentropy": 2.6192381381988525, "loss/fcd": 1.52734375, "loss/idx": 7.0, "loss/logits": 0.19534087181091309, "step": 1298 }, { "epoch": 0.019396889628860898, "grad_norm": 0.7734375, "grad_norm_var": 0.033126052220662436, "learning_rate": 0.0001, "loss": 1.8283, "loss/crossentropy": 2.6694098711013794, "loss/fcd": 1.61328125, "loss/idx": 7.0, "loss/logits": 0.21504506468772888, "step": 1299 }, { "epoch": 0.01941182179947588, "grad_norm": 0.67578125, "grad_norm_var": 0.03764786720275879, "learning_rate": 0.0001, "loss": 1.842, "loss/crossentropy": 2.5446014404296875, "loss/fcd": 1.62109375, "loss/idx": 7.0, "loss/logits": 0.22087354958057404, "step": 1300 }, { "epoch": 0.019426753970090863, "grad_norm": 0.7734375, "grad_norm_var": 0.04444732666015625, "learning_rate": 0.0001, "loss": 1.7877, "loss/crossentropy": 2.3532973527908325, "loss/fcd": 1.609375, "loss/idx": 7.0, "loss/logits": 0.17832990735769272, "step": 1301 }, { "epoch": 0.019441686140705843, "grad_norm": 0.6171875, "grad_norm_var": 0.04504337310791016, "learning_rate": 0.0001, "loss": 1.6963, "loss/crossentropy": 2.371212661266327, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.17282748222351074, "step": 1302 }, { "epoch": 0.019456618311320825, "grad_norm": 0.609375, "grad_norm_var": 0.04425481160481771, "learning_rate": 0.0001, "loss": 1.6384, "loss/crossentropy": 2.8181079626083374, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.17750699073076248, "step": 1303 }, { "epoch": 0.019471550481935808, "grad_norm": 0.6328125, "grad_norm_var": 0.043195708592732744, "learning_rate": 0.0001, "loss": 1.6662, "loss/crossentropy": 2.6716779470443726, "loss/fcd": 1.48828125, "loss/idx": 7.0, "loss/logits": 0.17791379988193512, "step": 1304 }, { "epoch": 0.019486482652550787, "grad_norm": 0.55859375, "grad_norm_var": 0.03903477986653646, "learning_rate": 0.0001, "loss": 2.0067, "loss/crossentropy": 2.527832865715027, "loss/fcd": 1.73828125, "loss/idx": 7.0, "loss/logits": 0.26841960847377777, "step": 1305 }, { "epoch": 0.01950141482316577, "grad_norm": 0.56640625, "grad_norm_var": 0.0359981377919515, "learning_rate": 0.0001, "loss": 1.6607, "loss/crossentropy": 2.5668392181396484, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.18021684139966965, "step": 1306 }, { "epoch": 0.019516346993780753, "grad_norm": 0.58203125, "grad_norm_var": 0.03161123593648275, "learning_rate": 0.0001, "loss": 1.8421, "loss/crossentropy": 2.521816849708557, "loss/fcd": 1.625, "loss/idx": 7.0, "loss/logits": 0.21707086265087128, "step": 1307 }, { "epoch": 0.019531279164395732, "grad_norm": 0.4921875, "grad_norm_var": 0.02870941162109375, "learning_rate": 0.0001, "loss": 1.5868, "loss/crossentropy": 2.608349084854126, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.172712080180645, "step": 1308 }, { "epoch": 0.019546211335010714, "grad_norm": 0.60546875, "grad_norm_var": 0.024434852600097656, "learning_rate": 0.0001, "loss": 1.9755, "loss/crossentropy": 2.6307021379470825, "loss/fcd": 1.71875, "loss/idx": 7.0, "loss/logits": 0.2567453756928444, "step": 1309 }, { "epoch": 0.019561143505625694, "grad_norm": 0.61328125, "grad_norm_var": 0.0191158135732015, "learning_rate": 0.0001, "loss": 1.7592, "loss/crossentropy": 2.666864514350891, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.2044796496629715, "step": 1310 }, { "epoch": 0.019576075676240676, "grad_norm": 0.498046875, "grad_norm_var": 0.01438751220703125, "learning_rate": 0.0001, "loss": 1.5998, "loss/crossentropy": 2.6031850576400757, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.1700783297419548, "step": 1311 }, { "epoch": 0.01959100784685566, "grad_norm": 0.65234375, "grad_norm_var": 0.009866444269816081, "learning_rate": 0.0001, "loss": 1.7639, "loss/crossentropy": 2.525311231613159, "loss/fcd": 1.5703125, "loss/idx": 7.0, "loss/logits": 0.1935998499393463, "step": 1312 }, { "epoch": 0.01960594001747064, "grad_norm": 0.53515625, "grad_norm_var": 0.010428349177042643, "learning_rate": 0.0001, "loss": 1.6331, "loss/crossentropy": 2.5671015977859497, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.17994633316993713, "step": 1313 }, { "epoch": 0.01962087218808562, "grad_norm": 0.439453125, "grad_norm_var": 0.008362325032552083, "learning_rate": 0.0001, "loss": 1.5889, "loss/crossentropy": 2.5429428815841675, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.1670430600643158, "step": 1314 }, { "epoch": 0.019635804358700604, "grad_norm": 0.56640625, "grad_norm_var": 0.006296730041503907, "learning_rate": 0.0001, "loss": 1.8112, "loss/crossentropy": 2.7133418321609497, "loss/fcd": 1.5859375, "loss/idx": 7.0, "loss/logits": 0.22525541484355927, "step": 1315 }, { "epoch": 0.019650736529315583, "grad_norm": 0.62890625, "grad_norm_var": 0.005889320373535156, "learning_rate": 0.0001, "loss": 1.7866, "loss/crossentropy": 2.3733800649642944, "loss/fcd": 1.58203125, "loss/idx": 7.0, "loss/logits": 0.2045341208577156, "step": 1316 }, { "epoch": 0.019665668699930566, "grad_norm": 0.61328125, "grad_norm_var": 0.0034833272298177083, "learning_rate": 0.0001, "loss": 1.9862, "loss/crossentropy": 2.2801279425621033, "loss/fcd": 1.75390625, "loss/idx": 7.0, "loss/logits": 0.23227345198392868, "step": 1317 }, { "epoch": 0.01968060087054555, "grad_norm": 0.44921875, "grad_norm_var": 0.004317156473795573, "learning_rate": 0.0001, "loss": 1.7337, "loss/crossentropy": 2.816790819168091, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.21028374135494232, "step": 1318 }, { "epoch": 0.019695533041160528, "grad_norm": 0.66015625, "grad_norm_var": 0.00477752685546875, "learning_rate": 0.0001, "loss": 1.9427, "loss/crossentropy": 2.5892642736434937, "loss/fcd": 1.71875, "loss/idx": 7.0, "loss/logits": 0.22399146854877472, "step": 1319 }, { "epoch": 0.01971046521177551, "grad_norm": 0.45703125, "grad_norm_var": 0.005198097229003907, "learning_rate": 0.0001, "loss": 1.6056, "loss/crossentropy": 2.3007309436798096, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.1798677295446396, "step": 1320 }, { "epoch": 0.01972539738239049, "grad_norm": 0.45703125, "grad_norm_var": 0.005826250712076823, "learning_rate": 0.0001, "loss": 1.6315, "loss/crossentropy": 2.612833857536316, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.18622201681137085, "step": 1321 }, { "epoch": 0.019740329553005472, "grad_norm": 0.546875, "grad_norm_var": 0.0058100382486979164, "learning_rate": 0.0001, "loss": 1.904, "loss/crossentropy": 2.3265438079833984, "loss/fcd": 1.6875, "loss/idx": 7.0, "loss/logits": 0.21648503094911575, "step": 1322 }, { "epoch": 0.019755261723620455, "grad_norm": 0.419921875, "grad_norm_var": 0.006755940119425456, "learning_rate": 0.0001, "loss": 1.7704, "loss/crossentropy": 2.4403117895126343, "loss/fcd": 1.546875, "loss/idx": 7.0, "loss/logits": 0.22352954745292664, "step": 1323 }, { "epoch": 0.019770193894235434, "grad_norm": 0.458984375, "grad_norm_var": 0.007035064697265625, "learning_rate": 0.0001, "loss": 1.6172, "loss/crossentropy": 2.571325421333313, "loss/fcd": 1.44140625, "loss/idx": 7.0, "loss/logits": 0.17575164139270782, "step": 1324 }, { "epoch": 0.019785126064850417, "grad_norm": 0.423828125, "grad_norm_var": 0.007453393936157226, "learning_rate": 0.0001, "loss": 1.8181, "loss/crossentropy": 2.5124125480651855, "loss/fcd": 1.58203125, "loss/idx": 7.0, "loss/logits": 0.23606669902801514, "step": 1325 }, { "epoch": 0.0198000582354654, "grad_norm": 0.4765625, "grad_norm_var": 0.007035048802693685, "learning_rate": 0.0001, "loss": 1.6305, "loss/crossentropy": 2.6320416927337646, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.18130720406770706, "step": 1326 }, { "epoch": 0.01981499040608038, "grad_norm": 0.470703125, "grad_norm_var": 0.007153431574503581, "learning_rate": 0.0001, "loss": 1.7774, "loss/crossentropy": 2.6659634113311768, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.2227148711681366, "step": 1327 }, { "epoch": 0.01982992257669536, "grad_norm": 0.54296875, "grad_norm_var": 0.005912637710571289, "learning_rate": 0.0001, "loss": 1.828, "loss/crossentropy": 2.837485194206238, "loss/fcd": 1.59765625, "loss/idx": 7.0, "loss/logits": 0.23030738532543182, "step": 1328 }, { "epoch": 0.019844854747310344, "grad_norm": 0.453125, "grad_norm_var": 0.006048822402954101, "learning_rate": 0.0001, "loss": 1.6038, "loss/crossentropy": 2.6289491653442383, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.18195120990276337, "step": 1329 }, { "epoch": 0.019859786917925323, "grad_norm": 0.455078125, "grad_norm_var": 0.00592954953511556, "learning_rate": 0.0001, "loss": 1.6263, "loss/crossentropy": 2.5986313819885254, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.18099741637706757, "step": 1330 }, { "epoch": 0.019874719088540306, "grad_norm": 0.4453125, "grad_norm_var": 0.005854654312133789, "learning_rate": 0.0001, "loss": 1.6214, "loss/crossentropy": 2.4408994913101196, "loss/fcd": 1.44140625, "loss/idx": 7.0, "loss/logits": 0.17999741435050964, "step": 1331 }, { "epoch": 0.01988965125915529, "grad_norm": 0.4609375, "grad_norm_var": 0.004673624038696289, "learning_rate": 0.0001, "loss": 1.5941, "loss/crossentropy": 2.6180654764175415, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.18002255260944366, "step": 1332 }, { "epoch": 0.019904583429770268, "grad_norm": 0.423828125, "grad_norm_var": 0.003725433349609375, "learning_rate": 0.0001, "loss": 1.5672, "loss/crossentropy": 2.5988270044326782, "loss/fcd": 1.390625, "loss/idx": 7.0, "loss/logits": 0.17657187581062317, "step": 1333 }, { "epoch": 0.01991951560038525, "grad_norm": 0.5703125, "grad_norm_var": 0.004224077860514323, "learning_rate": 0.0001, "loss": 1.8835, "loss/crossentropy": 2.5439051389694214, "loss/fcd": 1.64453125, "loss/idx": 7.0, "loss/logits": 0.2389221414923668, "step": 1334 }, { "epoch": 0.01993444777100023, "grad_norm": 0.5625, "grad_norm_var": 0.0025090535481770834, "learning_rate": 0.0001, "loss": 1.7545, "loss/crossentropy": 2.7638096809387207, "loss/fcd": 1.5390625, "loss/idx": 7.0, "loss/logits": 0.21539074927568436, "step": 1335 }, { "epoch": 0.019949379941615213, "grad_norm": 0.5, "grad_norm_var": 0.002512550354003906, "learning_rate": 0.0001, "loss": 1.5921, "loss/crossentropy": 2.695431113243103, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.1780625656247139, "step": 1336 }, { "epoch": 0.019964312112230195, "grad_norm": 0.47265625, "grad_norm_var": 0.002481524149576823, "learning_rate": 0.0001, "loss": 1.7354, "loss/crossentropy": 2.447667717933655, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.22754760086536407, "step": 1337 }, { "epoch": 0.019979244282845175, "grad_norm": 0.63671875, "grad_norm_var": 0.003784434000651042, "learning_rate": 0.0001, "loss": 1.9037, "loss/crossentropy": 2.5905754566192627, "loss/fcd": 1.66796875, "loss/idx": 7.0, "loss/logits": 0.2357194721698761, "step": 1338 }, { "epoch": 0.019994176453460157, "grad_norm": 0.453125, "grad_norm_var": 0.003561512629191081, "learning_rate": 0.0001, "loss": 1.7016, "loss/crossentropy": 2.56851863861084, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.18602270632982254, "step": 1339 }, { "epoch": 0.02000910862407514, "grad_norm": 0.5625, "grad_norm_var": 0.003831926981608073, "learning_rate": 0.0001, "loss": 1.9636, "loss/crossentropy": 2.431572914123535, "loss/fcd": 1.7265625, "loss/idx": 7.0, "loss/logits": 0.23702546209096909, "step": 1340 }, { "epoch": 0.02002404079469012, "grad_norm": 0.78515625, "grad_norm_var": 0.008592589696248373, "learning_rate": 0.0001, "loss": 2.0499, "loss/crossentropy": 2.5989317893981934, "loss/fcd": 1.765625, "loss/idx": 7.0, "loss/logits": 0.2842986583709717, "step": 1341 }, { "epoch": 0.020038972965305102, "grad_norm": 0.44921875, "grad_norm_var": 0.008786630630493165, "learning_rate": 0.0001, "loss": 1.7025, "loss/crossentropy": 2.804073929786682, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.21030206978321075, "step": 1342 }, { "epoch": 0.020053905135920085, "grad_norm": 0.5546875, "grad_norm_var": 0.008728535970052083, "learning_rate": 0.0001, "loss": 1.8545, "loss/crossentropy": 2.5097142457962036, "loss/fcd": 1.6328125, "loss/idx": 7.0, "loss/logits": 0.22171549499034882, "step": 1343 }, { "epoch": 0.020068837306535064, "grad_norm": 0.494140625, "grad_norm_var": 0.008731317520141602, "learning_rate": 0.0001, "loss": 1.8018, "loss/crossentropy": 2.5837600231170654, "loss/fcd": 1.5625, "loss/idx": 7.0, "loss/logits": 0.239334374666214, "step": 1344 }, { "epoch": 0.020083769477150046, "grad_norm": 0.44921875, "grad_norm_var": 0.008765776952107748, "learning_rate": 0.0001, "loss": 1.5829, "loss/crossentropy": 2.578310489654541, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.1648903787136078, "step": 1345 }, { "epoch": 0.020098701647765026, "grad_norm": 0.44921875, "grad_norm_var": 0.00881646474202474, "learning_rate": 0.0001, "loss": 1.5699, "loss/crossentropy": 2.5372471809387207, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.1597445085644722, "step": 1346 }, { "epoch": 0.02011363381838001, "grad_norm": 0.44921875, "grad_norm_var": 0.008780161539713541, "learning_rate": 0.0001, "loss": 1.6982, "loss/crossentropy": 2.603432536125183, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.19433686882257462, "step": 1347 }, { "epoch": 0.02012856598899499, "grad_norm": 0.515625, "grad_norm_var": 0.008557637532552084, "learning_rate": 0.0001, "loss": 1.8823, "loss/crossentropy": 2.557613253593445, "loss/fcd": 1.6640625, "loss/idx": 7.0, "loss/logits": 0.21824145317077637, "step": 1348 }, { "epoch": 0.02014349815960997, "grad_norm": 0.4765625, "grad_norm_var": 0.008051665623982747, "learning_rate": 0.0001, "loss": 1.6971, "loss/crossentropy": 2.5176939964294434, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.18926101177930832, "step": 1349 }, { "epoch": 0.020158430330224953, "grad_norm": 0.482421875, "grad_norm_var": 0.007989438374837239, "learning_rate": 0.0001, "loss": 1.6574, "loss/crossentropy": 2.6994282007217407, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.19250737130641937, "step": 1350 }, { "epoch": 0.020173362500839936, "grad_norm": 0.447265625, "grad_norm_var": 0.00814042091369629, "learning_rate": 0.0001, "loss": 1.6293, "loss/crossentropy": 2.5995413064956665, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.18009082227945328, "step": 1351 }, { "epoch": 0.020188294671454915, "grad_norm": 0.51171875, "grad_norm_var": 0.008131647109985351, "learning_rate": 0.0001, "loss": 1.8459, "loss/crossentropy": 2.503365993499756, "loss/fcd": 1.61328125, "loss/idx": 7.0, "loss/logits": 0.23260920494794846, "step": 1352 }, { "epoch": 0.020203226842069898, "grad_norm": 0.455078125, "grad_norm_var": 0.0082427978515625, "learning_rate": 0.0001, "loss": 1.5196, "loss/crossentropy": 2.564800977706909, "loss/fcd": 1.36328125, "loss/idx": 7.0, "loss/logits": 0.15636174380779266, "step": 1353 }, { "epoch": 0.02021815901268488, "grad_norm": 0.4921875, "grad_norm_var": 0.007120704650878907, "learning_rate": 0.0001, "loss": 1.7418, "loss/crossentropy": 2.3416577577590942, "loss/fcd": 1.546875, "loss/idx": 7.0, "loss/logits": 0.1948934942483902, "step": 1354 }, { "epoch": 0.02023309118329986, "grad_norm": 0.4765625, "grad_norm_var": 0.007003211975097656, "learning_rate": 0.0001, "loss": 1.6993, "loss/crossentropy": 2.5845792293548584, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.20708955079317093, "step": 1355 }, { "epoch": 0.020248023353914842, "grad_norm": 0.412109375, "grad_norm_var": 0.007227182388305664, "learning_rate": 0.0001, "loss": 1.6796, "loss/crossentropy": 2.7967952489852905, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.20696670562028885, "step": 1356 }, { "epoch": 0.020262955524529825, "grad_norm": 0.40234375, "grad_norm_var": 0.00151365598042806, "learning_rate": 0.0001, "loss": 1.7359, "loss/crossentropy": 2.4202345609664917, "loss/fcd": 1.51953125, "loss/idx": 7.0, "loss/logits": 0.21634161472320557, "step": 1357 }, { "epoch": 0.020277887695144804, "grad_norm": 0.408203125, "grad_norm_var": 0.0017316182454427083, "learning_rate": 0.0001, "loss": 1.4691, "loss/crossentropy": 2.58681583404541, "loss/fcd": 1.3125, "loss/idx": 7.0, "loss/logits": 0.15658579766750336, "step": 1358 }, { "epoch": 0.020292819865759787, "grad_norm": 0.5546875, "grad_norm_var": 0.0017316182454427083, "learning_rate": 0.0001, "loss": 1.67, "loss/crossentropy": 2.6079635620117188, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.1777733862400055, "step": 1359 }, { "epoch": 0.020307752036374766, "grad_norm": 0.470703125, "grad_norm_var": 0.0016820271809895833, "learning_rate": 0.0001, "loss": 1.7681, "loss/crossentropy": 2.5722819566726685, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.20172813534736633, "step": 1360 }, { "epoch": 0.02032268420698975, "grad_norm": 0.490234375, "grad_norm_var": 0.0016963799794514974, "learning_rate": 0.0001, "loss": 1.7869, "loss/crossentropy": 2.726900815963745, "loss/fcd": 1.57421875, "loss/idx": 7.0, "loss/logits": 0.21265853196382523, "step": 1361 }, { "epoch": 0.02033761637760473, "grad_norm": 0.5078125, "grad_norm_var": 0.0017612298329671224, "learning_rate": 0.0001, "loss": 1.726, "loss/crossentropy": 2.5604859590530396, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.20253180712461472, "step": 1362 }, { "epoch": 0.02035254854821971, "grad_norm": 0.416015625, "grad_norm_var": 0.0019311904907226562, "learning_rate": 0.0001, "loss": 1.6416, "loss/crossentropy": 2.589616298675537, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.19624745845794678, "step": 1363 }, { "epoch": 0.020367480718834693, "grad_norm": 0.515625, "grad_norm_var": 0.0019311904907226562, "learning_rate": 0.0001, "loss": 1.9452, "loss/crossentropy": 2.620610475540161, "loss/fcd": 1.7109375, "loss/idx": 7.0, "loss/logits": 0.2342279702425003, "step": 1364 }, { "epoch": 0.020382412889449676, "grad_norm": 0.44921875, "grad_norm_var": 0.001953887939453125, "learning_rate": 0.0001, "loss": 1.7039, "loss/crossentropy": 2.894443392753601, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.18823960423469543, "step": 1365 }, { "epoch": 0.020397345060064655, "grad_norm": 0.431640625, "grad_norm_var": 0.002019182840983073, "learning_rate": 0.0001, "loss": 1.7526, "loss/crossentropy": 2.560238003730774, "loss/fcd": 1.53515625, "loss/idx": 7.0, "loss/logits": 0.2174069955945015, "step": 1366 }, { "epoch": 0.020412277230679638, "grad_norm": 0.59375, "grad_norm_var": 0.003012196222941081, "learning_rate": 0.0001, "loss": 1.9253, "loss/crossentropy": 2.3431705236434937, "loss/fcd": 1.69140625, "loss/idx": 7.0, "loss/logits": 0.23390838503837585, "step": 1367 }, { "epoch": 0.02042720940129462, "grad_norm": 0.48046875, "grad_norm_var": 0.0029170831044514974, "learning_rate": 0.0001, "loss": 1.6624, "loss/crossentropy": 2.6611838340759277, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.18582233786582947, "step": 1368 }, { "epoch": 0.0204421415719096, "grad_norm": 0.515625, "grad_norm_var": 0.0030072530110677085, "learning_rate": 0.0001, "loss": 1.7996, "loss/crossentropy": 2.379249095916748, "loss/fcd": 1.5859375, "loss/idx": 7.0, "loss/logits": 0.21370600908994675, "step": 1369 }, { "epoch": 0.020457073742524583, "grad_norm": 0.43359375, "grad_norm_var": 0.0030959447224934897, "learning_rate": 0.0001, "loss": 1.8179, "loss/crossentropy": 2.414939045906067, "loss/fcd": 1.59765625, "loss/idx": 7.0, "loss/logits": 0.22020775079727173, "step": 1370 }, { "epoch": 0.020472005913139562, "grad_norm": 0.41796875, "grad_norm_var": 0.0032780965169270835, "learning_rate": 0.0001, "loss": 1.617, "loss/crossentropy": 2.5957616567611694, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.1795307919383049, "step": 1371 }, { "epoch": 0.020486938083754545, "grad_norm": 0.46875, "grad_norm_var": 0.0030508518218994142, "learning_rate": 0.0001, "loss": 1.5796, "loss/crossentropy": 2.551032066345215, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.16551107168197632, "step": 1372 }, { "epoch": 0.020501870254369527, "grad_norm": 0.58984375, "grad_norm_var": 0.003499460220336914, "learning_rate": 0.0001, "loss": 1.902, "loss/crossentropy": 2.4808170795440674, "loss/fcd": 1.65625, "loss/idx": 7.0, "loss/logits": 0.245725117623806, "step": 1373 }, { "epoch": 0.020516802424984507, "grad_norm": 0.5390625, "grad_norm_var": 0.0032470703125, "learning_rate": 0.0001, "loss": 1.6204, "loss/crossentropy": 2.8979157209396362, "loss/fcd": 1.44140625, "loss/idx": 7.0, "loss/logits": 0.17895907908678055, "step": 1374 }, { "epoch": 0.02053173459559949, "grad_norm": 0.43359375, "grad_norm_var": 0.0031544367472330728, "learning_rate": 0.0001, "loss": 1.6163, "loss/crossentropy": 2.526008129119873, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.17878473550081253, "step": 1375 }, { "epoch": 0.020546666766214472, "grad_norm": 0.51953125, "grad_norm_var": 0.0032128492991129556, "learning_rate": 0.0001, "loss": 1.6617, "loss/crossentropy": 2.5934818983078003, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.20080163329839706, "step": 1376 }, { "epoch": 0.02056159893682945, "grad_norm": 0.439453125, "grad_norm_var": 0.0033566633860270183, "learning_rate": 0.0001, "loss": 1.7143, "loss/crossentropy": 2.480017900466919, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.1908370852470398, "step": 1377 }, { "epoch": 0.020576531107444434, "grad_norm": 0.5, "grad_norm_var": 0.003336191177368164, "learning_rate": 0.0001, "loss": 1.6938, "loss/crossentropy": 2.504163980484009, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.1859615296125412, "step": 1378 }, { "epoch": 0.020591463278059417, "grad_norm": 0.458984375, "grad_norm_var": 0.0030620416005452474, "learning_rate": 0.0001, "loss": 1.5957, "loss/crossentropy": 2.5576053857803345, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.17774324119091034, "step": 1379 }, { "epoch": 0.020606395448674396, "grad_norm": 0.400390625, "grad_norm_var": 0.0034474690755208334, "learning_rate": 0.0001, "loss": 1.5509, "loss/crossentropy": 2.699017286300659, "loss/fcd": 1.38671875, "loss/idx": 7.0, "loss/logits": 0.16415952146053314, "step": 1380 }, { "epoch": 0.02062132761928938, "grad_norm": 0.396484375, "grad_norm_var": 0.0038341363271077473, "learning_rate": 0.0001, "loss": 1.6093, "loss/crossentropy": 2.818260431289673, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.19137796759605408, "step": 1381 }, { "epoch": 0.020636259789904358, "grad_norm": 0.408203125, "grad_norm_var": 0.004007705052693685, "learning_rate": 0.0001, "loss": 1.7416, "loss/crossentropy": 2.5644644498825073, "loss/fcd": 1.51171875, "loss/idx": 7.0, "loss/logits": 0.22985026240348816, "step": 1382 }, { "epoch": 0.02065119196051934, "grad_norm": 0.439453125, "grad_norm_var": 0.0030471165974934895, "learning_rate": 0.0001, "loss": 1.8247, "loss/crossentropy": 2.451001286506653, "loss/fcd": 1.6015625, "loss/idx": 7.0, "loss/logits": 0.22313947975635529, "step": 1383 }, { "epoch": 0.020666124131134323, "grad_norm": 0.5, "grad_norm_var": 0.0031110127766927082, "learning_rate": 0.0001, "loss": 1.7051, "loss/crossentropy": 2.5355674028396606, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.2051396146416664, "step": 1384 }, { "epoch": 0.020681056301749302, "grad_norm": 0.458984375, "grad_norm_var": 0.0029390811920166015, "learning_rate": 0.0001, "loss": 1.6715, "loss/crossentropy": 2.6771148443222046, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.2105863317847252, "step": 1385 }, { "epoch": 0.020695988472364285, "grad_norm": 0.41796875, "grad_norm_var": 0.00301512082417806, "learning_rate": 0.0001, "loss": 1.6794, "loss/crossentropy": 2.6094021797180176, "loss/fcd": 1.48828125, "loss/idx": 7.0, "loss/logits": 0.1910756528377533, "step": 1386 }, { "epoch": 0.020710920642979268, "grad_norm": 0.443359375, "grad_norm_var": 0.002907053629557292, "learning_rate": 0.0001, "loss": 1.7082, "loss/crossentropy": 2.6047022342681885, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.21599744260311127, "step": 1387 }, { "epoch": 0.020725852813594247, "grad_norm": 0.46875, "grad_norm_var": 0.002907053629557292, "learning_rate": 0.0001, "loss": 1.6647, "loss/crossentropy": 2.523642063140869, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.18419228494167328, "step": 1388 }, { "epoch": 0.02074078498420923, "grad_norm": 0.4453125, "grad_norm_var": 0.0017755508422851562, "learning_rate": 0.0001, "loss": 1.6581, "loss/crossentropy": 2.8733028173446655, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.18934316188097, "step": 1389 }, { "epoch": 0.020755717154824212, "grad_norm": 0.51953125, "grad_norm_var": 0.0015787760416666667, "learning_rate": 0.0001, "loss": 1.8048, "loss/crossentropy": 2.4046658277511597, "loss/fcd": 1.6015625, "loss/idx": 7.0, "loss/logits": 0.20321223884820938, "step": 1390 }, { "epoch": 0.02077064932543919, "grad_norm": 0.455078125, "grad_norm_var": 0.001551675796508789, "learning_rate": 0.0001, "loss": 1.7306, "loss/crossentropy": 2.4122198820114136, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.2071341872215271, "step": 1391 }, { "epoch": 0.020785581496054174, "grad_norm": 0.431640625, "grad_norm_var": 0.0012720108032226562, "learning_rate": 0.0001, "loss": 1.5693, "loss/crossentropy": 2.4567726850509644, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.1747429072856903, "step": 1392 }, { "epoch": 0.020800513666669157, "grad_norm": 0.435546875, "grad_norm_var": 0.001277923583984375, "learning_rate": 0.0001, "loss": 1.5542, "loss/crossentropy": 2.69111704826355, "loss/fcd": 1.38671875, "loss/idx": 7.0, "loss/logits": 0.1674729585647583, "step": 1393 }, { "epoch": 0.020815445837284136, "grad_norm": 0.44140625, "grad_norm_var": 0.0010919570922851562, "learning_rate": 0.0001, "loss": 1.6307, "loss/crossentropy": 2.527459144592285, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.19319240003824234, "step": 1394 }, { "epoch": 0.02083037800789912, "grad_norm": 0.4453125, "grad_norm_var": 0.0010782718658447266, "learning_rate": 0.0001, "loss": 1.6287, "loss/crossentropy": 2.6566779613494873, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.1950736939907074, "step": 1395 }, { "epoch": 0.020845310178514098, "grad_norm": 0.482421875, "grad_norm_var": 0.0010195255279541015, "learning_rate": 0.0001, "loss": 1.7712, "loss/crossentropy": 2.42236065864563, "loss/fcd": 1.55859375, "loss/idx": 7.0, "loss/logits": 0.2125757485628128, "step": 1396 }, { "epoch": 0.02086024234912908, "grad_norm": 0.470703125, "grad_norm_var": 0.0008407433827718099, "learning_rate": 0.0001, "loss": 1.7156, "loss/crossentropy": 2.53904128074646, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.2156321182847023, "step": 1397 }, { "epoch": 0.020875174519744064, "grad_norm": 0.43359375, "grad_norm_var": 0.0007260640462239584, "learning_rate": 0.0001, "loss": 1.573, "loss/crossentropy": 2.6393632888793945, "loss/fcd": 1.3984375, "loss/idx": 7.0, "loss/logits": 0.1746109426021576, "step": 1398 }, { "epoch": 0.020890106690359043, "grad_norm": 0.4375, "grad_norm_var": 0.0007304986317952474, "learning_rate": 0.0001, "loss": 1.6177, "loss/crossentropy": 2.65228533744812, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.18801546841859818, "step": 1399 }, { "epoch": 0.020905038860974025, "grad_norm": 0.388671875, "grad_norm_var": 0.0008437474568684896, "learning_rate": 0.0001, "loss": 1.6124, "loss/crossentropy": 2.6110557317733765, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.19048527628183365, "step": 1400 }, { "epoch": 0.020919971031589008, "grad_norm": 0.421875, "grad_norm_var": 0.0008778731028238932, "learning_rate": 0.0001, "loss": 1.5752, "loss/crossentropy": 2.5177189111709595, "loss/fcd": 1.40234375, "loss/idx": 7.0, "loss/logits": 0.1728520169854164, "step": 1401 }, { "epoch": 0.020934903202203987, "grad_norm": 0.55078125, "grad_norm_var": 0.0014809767405192058, "learning_rate": 0.0001, "loss": 1.6969, "loss/crossentropy": 2.8006900548934937, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.1812676638364792, "step": 1402 }, { "epoch": 0.02094983537281897, "grad_norm": 0.546875, "grad_norm_var": 0.00199737548828125, "learning_rate": 0.0001, "loss": 1.6337, "loss/crossentropy": 2.6444613933563232, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.18452748656272888, "step": 1403 }, { "epoch": 0.020964767543433953, "grad_norm": 0.515625, "grad_norm_var": 0.00218353271484375, "learning_rate": 0.0001, "loss": 1.6804, "loss/crossentropy": 2.775398850440979, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.20384299010038376, "step": 1404 }, { "epoch": 0.020979699714048932, "grad_norm": 0.43359375, "grad_norm_var": 0.0022211074829101562, "learning_rate": 0.0001, "loss": 1.5523, "loss/crossentropy": 2.7846622467041016, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.1734142228960991, "step": 1405 }, { "epoch": 0.020994631884663915, "grad_norm": 0.416015625, "grad_norm_var": 0.0021124362945556642, "learning_rate": 0.0001, "loss": 1.4762, "loss/crossentropy": 2.86823308467865, "loss/fcd": 1.31640625, "loss/idx": 7.0, "loss/logits": 0.15978942066431046, "step": 1406 }, { "epoch": 0.021009564055278894, "grad_norm": 0.51953125, "grad_norm_var": 0.0023584365844726562, "learning_rate": 0.0001, "loss": 1.6031, "loss/crossentropy": 2.6583101749420166, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.18122605979442596, "step": 1407 }, { "epoch": 0.021024496225893877, "grad_norm": 0.5078125, "grad_norm_var": 0.002426004409790039, "learning_rate": 0.0001, "loss": 1.6574, "loss/crossentropy": 2.7387313842773438, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.19252792745828629, "step": 1408 }, { "epoch": 0.02103942839650886, "grad_norm": 0.416015625, "grad_norm_var": 0.002527729670206706, "learning_rate": 0.0001, "loss": 1.5176, "loss/crossentropy": 2.737845778465271, "loss/fcd": 1.359375, "loss/idx": 7.0, "loss/logits": 0.15820758044719696, "step": 1409 }, { "epoch": 0.02105436056712384, "grad_norm": 0.431640625, "grad_norm_var": 0.0025634129842122396, "learning_rate": 0.0001, "loss": 1.6793, "loss/crossentropy": 2.6955381631851196, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.2027088850736618, "step": 1410 }, { "epoch": 0.02106929273773882, "grad_norm": 0.41796875, "grad_norm_var": 0.0026769002278645834, "learning_rate": 0.0001, "loss": 1.6027, "loss/crossentropy": 2.7337403297424316, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.19255182147026062, "step": 1411 }, { "epoch": 0.021084224908353804, "grad_norm": 0.48828125, "grad_norm_var": 0.002695067723592122, "learning_rate": 0.0001, "loss": 1.7799, "loss/crossentropy": 2.895545721054077, "loss/fcd": 1.55859375, "loss/idx": 7.0, "loss/logits": 0.221343994140625, "step": 1412 }, { "epoch": 0.021099157078968783, "grad_norm": 0.5625, "grad_norm_var": 0.0033248265584309897, "learning_rate": 0.0001, "loss": 1.8734, "loss/crossentropy": 2.3325068950653076, "loss/fcd": 1.65234375, "loss/idx": 7.0, "loss/logits": 0.22110049426555634, "step": 1413 }, { "epoch": 0.021114089249583766, "grad_norm": 0.50390625, "grad_norm_var": 0.0033110936482747396, "learning_rate": 0.0001, "loss": 1.6906, "loss/crossentropy": 2.468624234199524, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.1866515427827835, "step": 1414 }, { "epoch": 0.02112902142019875, "grad_norm": 0.43359375, "grad_norm_var": 0.003330230712890625, "learning_rate": 0.0001, "loss": 1.5433, "loss/crossentropy": 2.677880644798279, "loss/fcd": 1.3828125, "loss/idx": 7.0, "loss/logits": 0.16050750017166138, "step": 1415 }, { "epoch": 0.021143953590813728, "grad_norm": 0.64453125, "grad_norm_var": 0.004573297500610351, "learning_rate": 0.0001, "loss": 2.297, "loss/crossentropy": 2.67445969581604, "loss/fcd": 2.01171875, "loss/idx": 7.0, "loss/logits": 0.28523435443639755, "step": 1416 }, { "epoch": 0.02115888576142871, "grad_norm": 0.52734375, "grad_norm_var": 0.004336404800415039, "learning_rate": 0.0001, "loss": 1.7373, "loss/crossentropy": 2.577675700187683, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.2216956913471222, "step": 1417 }, { "epoch": 0.021173817932043693, "grad_norm": 0.462890625, "grad_norm_var": 0.00416259765625, "learning_rate": 0.0001, "loss": 1.6366, "loss/crossentropy": 2.5695481300354004, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.1835166960954666, "step": 1418 }, { "epoch": 0.021188750102658672, "grad_norm": 0.91015625, "grad_norm_var": 0.015201759338378907, "learning_rate": 0.0001, "loss": 1.6032, "loss/crossentropy": 2.8455076217651367, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.17739622294902802, "step": 1419 }, { "epoch": 0.021203682273273655, "grad_norm": 0.4921875, "grad_norm_var": 0.015224647521972657, "learning_rate": 0.0001, "loss": 1.7873, "loss/crossentropy": 2.5857309103012085, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.24437353014945984, "step": 1420 }, { "epoch": 0.021218614443888634, "grad_norm": 0.447265625, "grad_norm_var": 0.015096139907836915, "learning_rate": 0.0001, "loss": 1.6124, "loss/crossentropy": 2.4495460987091064, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.1749189794063568, "step": 1421 }, { "epoch": 0.021233546614503617, "grad_norm": 0.51953125, "grad_norm_var": 0.014450009663899739, "learning_rate": 0.0001, "loss": 1.7511, "loss/crossentropy": 2.408402919769287, "loss/fcd": 1.53125, "loss/idx": 7.0, "loss/logits": 0.21988791227340698, "step": 1422 }, { "epoch": 0.0212484787851186, "grad_norm": 0.55859375, "grad_norm_var": 0.014554278055826823, "learning_rate": 0.0001, "loss": 1.6551, "loss/crossentropy": 2.7392570972442627, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.19412636011838913, "step": 1423 }, { "epoch": 0.02126341095573358, "grad_norm": 0.455078125, "grad_norm_var": 0.014815632502237957, "learning_rate": 0.0001, "loss": 1.6443, "loss/crossentropy": 2.530651807785034, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.1716858074069023, "step": 1424 }, { "epoch": 0.02127834312634856, "grad_norm": 0.455078125, "grad_norm_var": 0.014385207494099935, "learning_rate": 0.0001, "loss": 1.4788, "loss/crossentropy": 2.560731530189514, "loss/fcd": 1.31640625, "loss/idx": 7.0, "loss/logits": 0.16241511702537537, "step": 1425 }, { "epoch": 0.021293275296963544, "grad_norm": 0.470703125, "grad_norm_var": 0.014023447036743164, "learning_rate": 0.0001, "loss": 1.7572, "loss/crossentropy": 2.5136619806289673, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.2141972780227661, "step": 1426 }, { "epoch": 0.021308207467578524, "grad_norm": 0.51953125, "grad_norm_var": 0.01326139767964681, "learning_rate": 0.0001, "loss": 1.7078, "loss/crossentropy": 2.8287333250045776, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.20389527082443237, "step": 1427 }, { "epoch": 0.021323139638193506, "grad_norm": 0.494140625, "grad_norm_var": 0.01323235829671224, "learning_rate": 0.0001, "loss": 1.7222, "loss/crossentropy": 2.6399881839752197, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.20652509480714798, "step": 1428 }, { "epoch": 0.02133807180880849, "grad_norm": 0.451171875, "grad_norm_var": 0.013503249486287434, "learning_rate": 0.0001, "loss": 1.5779, "loss/crossentropy": 2.389400362968445, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.17167968302965164, "step": 1429 }, { "epoch": 0.02135300397942347, "grad_norm": 0.443359375, "grad_norm_var": 0.013875261942545573, "learning_rate": 0.0001, "loss": 1.758, "loss/crossentropy": 2.4448758363723755, "loss/fcd": 1.52734375, "loss/idx": 7.0, "loss/logits": 0.2306400090456009, "step": 1430 }, { "epoch": 0.02136793615003845, "grad_norm": 0.423828125, "grad_norm_var": 0.01399089495340983, "learning_rate": 0.0001, "loss": 1.6016, "loss/crossentropy": 2.542250633239746, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.1797719970345497, "step": 1431 }, { "epoch": 0.02138286832065343, "grad_norm": 0.4609375, "grad_norm_var": 0.012980890274047852, "learning_rate": 0.0001, "loss": 1.788, "loss/crossentropy": 2.531215786933899, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.24498464167118073, "step": 1432 }, { "epoch": 0.021397800491268413, "grad_norm": 0.474609375, "grad_norm_var": 0.013002777099609375, "learning_rate": 0.0001, "loss": 1.6529, "loss/crossentropy": 2.677714705467224, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.18806182593107224, "step": 1433 }, { "epoch": 0.021412732661883396, "grad_norm": 0.56640625, "grad_norm_var": 0.013126611709594727, "learning_rate": 0.0001, "loss": 1.822, "loss/crossentropy": 2.5922250747680664, "loss/fcd": 1.59375, "loss/idx": 7.0, "loss/logits": 0.2282187044620514, "step": 1434 }, { "epoch": 0.021427664832498375, "grad_norm": 0.515625, "grad_norm_var": 0.0017478783925374349, "learning_rate": 0.0001, "loss": 1.7681, "loss/crossentropy": 2.8243170976638794, "loss/fcd": 1.55859375, "loss/idx": 7.0, "loss/logits": 0.209464393556118, "step": 1435 }, { "epoch": 0.021442597003113358, "grad_norm": 0.5, "grad_norm_var": 0.001759958267211914, "learning_rate": 0.0001, "loss": 1.6335, "loss/crossentropy": 2.674209237098694, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.1803731545805931, "step": 1436 }, { "epoch": 0.02145752917372834, "grad_norm": 0.466796875, "grad_norm_var": 0.0016862074534098306, "learning_rate": 0.0001, "loss": 1.6252, "loss/crossentropy": 2.531915068626404, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.18767689168453217, "step": 1437 }, { "epoch": 0.02147246134434332, "grad_norm": 0.62109375, "grad_norm_var": 0.002785476048787435, "learning_rate": 0.0001, "loss": 1.7866, "loss/crossentropy": 2.2939316034317017, "loss/fcd": 1.6171875, "loss/idx": 7.0, "loss/logits": 0.16944652050733566, "step": 1438 }, { "epoch": 0.021487393514958302, "grad_norm": 0.474609375, "grad_norm_var": 0.0024840672810872394, "learning_rate": 0.0001, "loss": 1.7564, "loss/crossentropy": 2.6811925172805786, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.21343431621789932, "step": 1439 }, { "epoch": 0.021502325685573285, "grad_norm": 0.51953125, "grad_norm_var": 0.00246885617574056, "learning_rate": 0.0001, "loss": 1.8515, "loss/crossentropy": 2.4621083736419678, "loss/fcd": 1.62890625, "loss/idx": 7.0, "loss/logits": 0.2225768268108368, "step": 1440 }, { "epoch": 0.021517257856188264, "grad_norm": 0.6015625, "grad_norm_var": 0.0031066258748372396, "learning_rate": 0.0001, "loss": 2.0026, "loss/crossentropy": 2.472296357154846, "loss/fcd": 1.74609375, "loss/idx": 7.0, "loss/logits": 0.25650446116924286, "step": 1441 }, { "epoch": 0.021532190026803247, "grad_norm": 0.458984375, "grad_norm_var": 0.0031613667805989584, "learning_rate": 0.0001, "loss": 1.6266, "loss/crossentropy": 2.822437882423401, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.18909186869859695, "step": 1442 }, { "epoch": 0.021547122197418226, "grad_norm": 0.50390625, "grad_norm_var": 0.003134918212890625, "learning_rate": 0.0001, "loss": 1.5838, "loss/crossentropy": 2.8150917291641235, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.17753300070762634, "step": 1443 }, { "epoch": 0.02156205436803321, "grad_norm": 0.71875, "grad_norm_var": 0.006156396865844726, "learning_rate": 0.0001, "loss": 1.9223, "loss/crossentropy": 2.175217628479004, "loss/fcd": 1.69140625, "loss/idx": 7.0, "loss/logits": 0.2308632880449295, "step": 1444 }, { "epoch": 0.02157698653864819, "grad_norm": 0.412109375, "grad_norm_var": 0.006571563084920248, "learning_rate": 0.0001, "loss": 1.5694, "loss/crossentropy": 2.54789400100708, "loss/fcd": 1.3984375, "loss/idx": 7.0, "loss/logits": 0.170965775847435, "step": 1445 }, { "epoch": 0.02159191870926317, "grad_norm": 0.4921875, "grad_norm_var": 0.006285858154296875, "learning_rate": 0.0001, "loss": 1.6893, "loss/crossentropy": 2.5268924236297607, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.18932264298200607, "step": 1446 }, { "epoch": 0.021606850879878153, "grad_norm": 0.447265625, "grad_norm_var": 0.00604095458984375, "learning_rate": 0.0001, "loss": 1.6026, "loss/crossentropy": 2.735979676246643, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.18071038275957108, "step": 1447 }, { "epoch": 0.021621783050493136, "grad_norm": 0.609375, "grad_norm_var": 0.0063550313313802086, "learning_rate": 0.0001, "loss": 1.685, "loss/crossentropy": 2.3301628828048706, "loss/fcd": 1.51171875, "loss/idx": 7.0, "loss/logits": 0.1733197569847107, "step": 1448 }, { "epoch": 0.021636715221108115, "grad_norm": 0.45703125, "grad_norm_var": 0.0064899285634358725, "learning_rate": 0.0001, "loss": 1.5976, "loss/crossentropy": 2.946329712867737, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.18357960879802704, "step": 1449 }, { "epoch": 0.021651647391723098, "grad_norm": 0.546875, "grad_norm_var": 0.006400283177693685, "learning_rate": 0.0001, "loss": 1.8002, "loss/crossentropy": 2.6100656986236572, "loss/fcd": 1.5859375, "loss/idx": 7.0, "loss/logits": 0.21426425874233246, "step": 1450 }, { "epoch": 0.02166657956233808, "grad_norm": 0.546875, "grad_norm_var": 0.006436395645141602, "learning_rate": 0.0001, "loss": 1.8881, "loss/crossentropy": 2.5633318424224854, "loss/fcd": 1.63671875, "loss/idx": 7.0, "loss/logits": 0.2513733506202698, "step": 1451 }, { "epoch": 0.02168151173295306, "grad_norm": 0.439453125, "grad_norm_var": 0.00685571034749349, "learning_rate": 0.0001, "loss": 1.6285, "loss/crossentropy": 2.5173341035842896, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.19103511422872543, "step": 1452 }, { "epoch": 0.021696443903568043, "grad_norm": 0.6171875, "grad_norm_var": 0.007206964492797852, "learning_rate": 0.0001, "loss": 2.1584, "loss/crossentropy": 2.5997310876846313, "loss/fcd": 1.87890625, "loss/idx": 7.0, "loss/logits": 0.27947692573070526, "step": 1453 }, { "epoch": 0.021711376074183025, "grad_norm": 0.48828125, "grad_norm_var": 0.006681680679321289, "learning_rate": 0.0001, "loss": 1.7712, "loss/crossentropy": 2.172030210494995, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.20478252321481705, "step": 1454 }, { "epoch": 0.021726308244798005, "grad_norm": 0.498046875, "grad_norm_var": 0.006571435928344726, "learning_rate": 0.0001, "loss": 1.7373, "loss/crossentropy": 2.5943312644958496, "loss/fcd": 1.53125, "loss/idx": 7.0, "loss/logits": 0.20607079565525055, "step": 1455 }, { "epoch": 0.021741240415412987, "grad_norm": 0.5390625, "grad_norm_var": 0.006587966283162435, "learning_rate": 0.0001, "loss": 1.7219, "loss/crossentropy": 2.6256524324417114, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.20627319812774658, "step": 1456 }, { "epoch": 0.021756172586027966, "grad_norm": 0.48828125, "grad_norm_var": 0.00621183713277181, "learning_rate": 0.0001, "loss": 1.7206, "loss/crossentropy": 2.4680287837982178, "loss/fcd": 1.52734375, "loss/idx": 7.0, "loss/logits": 0.19329256564378738, "step": 1457 }, { "epoch": 0.02177110475664295, "grad_norm": 0.53125, "grad_norm_var": 0.005984242757161458, "learning_rate": 0.0001, "loss": 1.7452, "loss/crossentropy": 2.1940962076187134, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.20221325010061264, "step": 1458 }, { "epoch": 0.021786036927257932, "grad_norm": 0.546875, "grad_norm_var": 0.006001726786295573, "learning_rate": 0.0001, "loss": 1.8579, "loss/crossentropy": 2.3581950664520264, "loss/fcd": 1.64453125, "loss/idx": 7.0, "loss/logits": 0.21334326267242432, "step": 1459 }, { "epoch": 0.02180096909787291, "grad_norm": 0.53125, "grad_norm_var": 0.0033222834269205728, "learning_rate": 0.0001, "loss": 1.5861, "loss/crossentropy": 2.622925043106079, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.17985717952251434, "step": 1460 }, { "epoch": 0.021815901268487894, "grad_norm": 0.447265625, "grad_norm_var": 0.0029314676920572918, "learning_rate": 0.0001, "loss": 1.5527, "loss/crossentropy": 2.6179198026657104, "loss/fcd": 1.38671875, "loss/idx": 7.0, "loss/logits": 0.16600077599287033, "step": 1461 }, { "epoch": 0.021830833439102876, "grad_norm": 0.474609375, "grad_norm_var": 0.003002278010050456, "learning_rate": 0.0001, "loss": 1.71, "loss/crossentropy": 2.601523756980896, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.19441629946231842, "step": 1462 }, { "epoch": 0.021845765609717856, "grad_norm": 0.478515625, "grad_norm_var": 0.002789163589477539, "learning_rate": 0.0001, "loss": 1.5912, "loss/crossentropy": 2.3901792764663696, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.18108409643173218, "step": 1463 }, { "epoch": 0.02186069778033284, "grad_norm": 0.4375, "grad_norm_var": 0.0024730523427327475, "learning_rate": 0.0001, "loss": 1.5628, "loss/crossentropy": 2.7158310413360596, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.18390782922506332, "step": 1464 }, { "epoch": 0.02187562995094782, "grad_norm": 0.431640625, "grad_norm_var": 0.0026732762654622395, "learning_rate": 0.0001, "loss": 1.5699, "loss/crossentropy": 2.8458696603775024, "loss/fcd": 1.3984375, "loss/idx": 7.0, "loss/logits": 0.17150548100471497, "step": 1465 }, { "epoch": 0.0218905621215628, "grad_norm": 0.5, "grad_norm_var": 0.0025344212849934896, "learning_rate": 0.0001, "loss": 1.5382, "loss/crossentropy": 2.5054997205734253, "loss/fcd": 1.3828125, "loss/idx": 7.0, "loss/logits": 0.15543527156114578, "step": 1466 }, { "epoch": 0.021905494292177783, "grad_norm": 0.392578125, "grad_norm_var": 0.003053013483683268, "learning_rate": 0.0001, "loss": 1.4817, "loss/crossentropy": 2.427425265312195, "loss/fcd": 1.328125, "loss/idx": 7.0, "loss/logits": 0.15352777391672134, "step": 1467 }, { "epoch": 0.021920426462792762, "grad_norm": 0.490234375, "grad_norm_var": 0.0028711795806884766, "learning_rate": 0.0001, "loss": 1.5268, "loss/crossentropy": 2.5839306116104126, "loss/fcd": 1.359375, "loss/idx": 7.0, "loss/logits": 0.1674559861421585, "step": 1468 }, { "epoch": 0.021935358633407745, "grad_norm": 0.478515625, "grad_norm_var": 0.0017821629842122396, "learning_rate": 0.0001, "loss": 1.7259, "loss/crossentropy": 2.6946284770965576, "loss/fcd": 1.53125, "loss/idx": 7.0, "loss/logits": 0.19463558495044708, "step": 1469 }, { "epoch": 0.021950290804022728, "grad_norm": 0.44140625, "grad_norm_var": 0.0018966039021809896, "learning_rate": 0.0001, "loss": 1.6243, "loss/crossentropy": 2.6268084049224854, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.17897958308458328, "step": 1470 }, { "epoch": 0.021965222974637707, "grad_norm": 0.48828125, "grad_norm_var": 0.0018812656402587891, "learning_rate": 0.0001, "loss": 1.7699, "loss/crossentropy": 2.4048478603363037, "loss/fcd": 1.5703125, "loss/idx": 7.0, "loss/logits": 0.19957569986581802, "step": 1471 }, { "epoch": 0.02198015514525269, "grad_norm": 0.5, "grad_norm_var": 0.001674636205037435, "learning_rate": 0.0001, "loss": 1.6641, "loss/crossentropy": 2.7484490871429443, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.19537898898124695, "step": 1472 }, { "epoch": 0.021995087315867672, "grad_norm": 0.42578125, "grad_norm_var": 0.0018384138743082683, "learning_rate": 0.0001, "loss": 1.5943, "loss/crossentropy": 2.5969239473342896, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.18410242348909378, "step": 1473 }, { "epoch": 0.02201001948648265, "grad_norm": 0.44921875, "grad_norm_var": 0.0016408125559488932, "learning_rate": 0.0001, "loss": 1.5954, "loss/crossentropy": 2.6116435527801514, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.1774698942899704, "step": 1474 }, { "epoch": 0.022024951657097634, "grad_norm": 0.44140625, "grad_norm_var": 0.0012494246164957681, "learning_rate": 0.0001, "loss": 1.6374, "loss/crossentropy": 2.5894148349761963, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.1920812875032425, "step": 1475 }, { "epoch": 0.022039883827712617, "grad_norm": 0.63671875, "grad_norm_var": 0.002904240290323893, "learning_rate": 0.0001, "loss": 2.0558, "loss/crossentropy": 2.587377429008484, "loss/fcd": 1.8125, "loss/idx": 7.0, "loss/logits": 0.24327364563941956, "step": 1476 }, { "epoch": 0.022054815998327596, "grad_norm": 0.51171875, "grad_norm_var": 0.002971903483072917, "learning_rate": 0.0001, "loss": 1.828, "loss/crossentropy": 2.747790813446045, "loss/fcd": 1.6015625, "loss/idx": 7.0, "loss/logits": 0.2263966202735901, "step": 1477 }, { "epoch": 0.02206974816894258, "grad_norm": 0.46875, "grad_norm_var": 0.0029732863108317058, "learning_rate": 0.0001, "loss": 1.768, "loss/crossentropy": 2.6247669458389282, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.213314987719059, "step": 1478 }, { "epoch": 0.02208468033955756, "grad_norm": 0.6640625, "grad_norm_var": 0.005254872639973958, "learning_rate": 0.0001, "loss": 1.8736, "loss/crossentropy": 2.40477979183197, "loss/fcd": 1.66015625, "loss/idx": 7.0, "loss/logits": 0.213446743786335, "step": 1479 }, { "epoch": 0.02209961251017254, "grad_norm": 0.46875, "grad_norm_var": 0.005118560791015625, "learning_rate": 0.0001, "loss": 1.6391, "loss/crossentropy": 2.8101799488067627, "loss/fcd": 1.44140625, "loss/idx": 7.0, "loss/logits": 0.19768545031547546, "step": 1480 }, { "epoch": 0.022114544680787523, "grad_norm": 0.48828125, "grad_norm_var": 0.004902378718058268, "learning_rate": 0.0001, "loss": 1.6937, "loss/crossentropy": 2.3495898246765137, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.18976981192827225, "step": 1481 }, { "epoch": 0.022129476851402503, "grad_norm": 0.47265625, "grad_norm_var": 0.004913949966430664, "learning_rate": 0.0001, "loss": 1.6884, "loss/crossentropy": 2.4331902265548706, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.1962270364165306, "step": 1482 }, { "epoch": 0.022144409022017485, "grad_norm": 0.416015625, "grad_norm_var": 0.004648065567016602, "learning_rate": 0.0001, "loss": 1.6145, "loss/crossentropy": 2.6590933799743652, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.1965487003326416, "step": 1483 }, { "epoch": 0.022159341192632468, "grad_norm": 0.55078125, "grad_norm_var": 0.0048781712849934895, "learning_rate": 0.0001, "loss": 1.7436, "loss/crossentropy": 2.599213719367981, "loss/fcd": 1.51953125, "loss/idx": 7.0, "loss/logits": 0.22408346831798553, "step": 1484 }, { "epoch": 0.022174273363247447, "grad_norm": 0.4609375, "grad_norm_var": 0.0049335320790608725, "learning_rate": 0.0001, "loss": 1.65, "loss/crossentropy": 2.6557412147521973, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.19689878821372986, "step": 1485 }, { "epoch": 0.02218920553386243, "grad_norm": 0.486328125, "grad_norm_var": 0.004751841227213542, "learning_rate": 0.0001, "loss": 1.6507, "loss/crossentropy": 2.529180407524109, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.20145351439714432, "step": 1486 }, { "epoch": 0.022204137704477413, "grad_norm": 0.46875, "grad_norm_var": 0.004794756571451823, "learning_rate": 0.0001, "loss": 1.6538, "loss/crossentropy": 2.4987757205963135, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.17723794281482697, "step": 1487 }, { "epoch": 0.022219069875092392, "grad_norm": 0.439453125, "grad_norm_var": 0.004978545506795247, "learning_rate": 0.0001, "loss": 1.5982, "loss/crossentropy": 2.609381675720215, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.18409955501556396, "step": 1488 }, { "epoch": 0.022234002045707375, "grad_norm": 0.474609375, "grad_norm_var": 0.004705556233723958, "learning_rate": 0.0001, "loss": 1.7746, "loss/crossentropy": 2.508987545967102, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.2199431136250496, "step": 1489 }, { "epoch": 0.022248934216322357, "grad_norm": 0.62890625, "grad_norm_var": 0.005658976236979167, "learning_rate": 0.0001, "loss": 1.89, "loss/crossentropy": 2.3111783266067505, "loss/fcd": 1.67578125, "loss/idx": 7.0, "loss/logits": 0.21425354480743408, "step": 1490 }, { "epoch": 0.022263866386937337, "grad_norm": 0.640625, "grad_norm_var": 0.006453386942545573, "learning_rate": 0.0001, "loss": 1.7514, "loss/crossentropy": 2.5704126358032227, "loss/fcd": 1.55859375, "loss/idx": 7.0, "loss/logits": 0.19285417348146439, "step": 1491 }, { "epoch": 0.02227879855755232, "grad_norm": 0.498046875, "grad_norm_var": 0.005447880427042643, "learning_rate": 0.0001, "loss": 1.7764, "loss/crossentropy": 2.5152299404144287, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.22169267386198044, "step": 1492 }, { "epoch": 0.0222937307281673, "grad_norm": 0.412109375, "grad_norm_var": 0.0060274759928385414, "learning_rate": 0.0001, "loss": 1.5753, "loss/crossentropy": 2.641311526298523, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.1807536482810974, "step": 1493 }, { "epoch": 0.02230866289878228, "grad_norm": 0.44921875, "grad_norm_var": 0.006139055887858073, "learning_rate": 0.0001, "loss": 1.7143, "loss/crossentropy": 2.45189368724823, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.22211898863315582, "step": 1494 }, { "epoch": 0.022323595069397264, "grad_norm": 0.365234375, "grad_norm_var": 0.005231968561808268, "learning_rate": 0.0001, "loss": 1.4379, "loss/crossentropy": 2.4117982387542725, "loss/fcd": 1.2890625, "loss/idx": 7.0, "loss/logits": 0.14883895218372345, "step": 1495 }, { "epoch": 0.022338527240012243, "grad_norm": 0.6171875, "grad_norm_var": 0.006336069107055664, "learning_rate": 0.0001, "loss": 1.7348, "loss/crossentropy": 2.4826900959014893, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.19185729324817657, "step": 1496 }, { "epoch": 0.022353459410627226, "grad_norm": 0.439453125, "grad_norm_var": 0.0065081278483072914, "learning_rate": 0.0001, "loss": 1.5233, "loss/crossentropy": 2.655819535255432, "loss/fcd": 1.359375, "loss/idx": 7.0, "loss/logits": 0.1639203429222107, "step": 1497 }, { "epoch": 0.02236839158124221, "grad_norm": 0.478515625, "grad_norm_var": 0.006497685114542643, "learning_rate": 0.0001, "loss": 1.7682, "loss/crossentropy": 2.4737117290496826, "loss/fcd": 1.5625, "loss/idx": 7.0, "loss/logits": 0.20565108954906464, "step": 1498 }, { "epoch": 0.022383323751857188, "grad_norm": 0.419921875, "grad_norm_var": 0.00646055539449056, "learning_rate": 0.0001, "loss": 1.7136, "loss/crossentropy": 2.436096429824829, "loss/fcd": 1.484375, "loss/idx": 7.0, "loss/logits": 0.22923439741134644, "step": 1499 }, { "epoch": 0.02239825592247217, "grad_norm": 0.44140625, "grad_norm_var": 0.006312799453735351, "learning_rate": 0.0001, "loss": 1.6022, "loss/crossentropy": 2.6730234622955322, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.1842484176158905, "step": 1500 }, { "epoch": 0.022413188093087153, "grad_norm": 0.439453125, "grad_norm_var": 0.006403541564941407, "learning_rate": 0.0001, "loss": 1.6707, "loss/crossentropy": 2.520161986351013, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.20194847136735916, "step": 1501 }, { "epoch": 0.022428120263702132, "grad_norm": 0.44140625, "grad_norm_var": 0.006498956680297851, "learning_rate": 0.0001, "loss": 1.6007, "loss/crossentropy": 2.7842148542404175, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.1944282054901123, "step": 1502 }, { "epoch": 0.022443052434317115, "grad_norm": 0.48046875, "grad_norm_var": 0.006492471694946289, "learning_rate": 0.0001, "loss": 1.6758, "loss/crossentropy": 2.7076008319854736, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.19530197978019714, "step": 1503 }, { "epoch": 0.022457984604932094, "grad_norm": 0.46875, "grad_norm_var": 0.006391143798828125, "learning_rate": 0.0001, "loss": 1.6203, "loss/crossentropy": 2.8342201709747314, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.19059840589761734, "step": 1504 }, { "epoch": 0.022472916775547077, "grad_norm": 0.46875, "grad_norm_var": 0.006398248672485352, "learning_rate": 0.0001, "loss": 1.7523, "loss/crossentropy": 2.3385519981384277, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.20937157422304153, "step": 1505 }, { "epoch": 0.02248784894616206, "grad_norm": 0.400390625, "grad_norm_var": 0.005142974853515625, "learning_rate": 0.0001, "loss": 1.5736, "loss/crossentropy": 2.586688756942749, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.16738758236169815, "step": 1506 }, { "epoch": 0.02250278111677704, "grad_norm": 0.443359375, "grad_norm_var": 0.0029901981353759764, "learning_rate": 0.0001, "loss": 1.662, "loss/crossentropy": 2.3330858945846558, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.18151338398456573, "step": 1507 }, { "epoch": 0.02251771328739202, "grad_norm": 0.51171875, "grad_norm_var": 0.0030822118123372396, "learning_rate": 0.0001, "loss": 1.7309, "loss/crossentropy": 2.3610141277313232, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.18796861171722412, "step": 1508 }, { "epoch": 0.022532645458007004, "grad_norm": 0.4375, "grad_norm_var": 0.002977863947550456, "learning_rate": 0.0001, "loss": 1.6468, "loss/crossentropy": 2.5573190450668335, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.19369368255138397, "step": 1509 }, { "epoch": 0.022547577628621984, "grad_norm": 0.515625, "grad_norm_var": 0.003189706802368164, "learning_rate": 0.0001, "loss": 1.8248, "loss/crossentropy": 2.914300322532654, "loss/fcd": 1.6015625, "loss/idx": 7.0, "loss/logits": 0.22323106229305267, "step": 1510 }, { "epoch": 0.022562509799236966, "grad_norm": 0.435546875, "grad_norm_var": 0.0026049137115478514, "learning_rate": 0.0001, "loss": 1.5476, "loss/crossentropy": 2.729630470275879, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.16865114867687225, "step": 1511 }, { "epoch": 0.02257744196985195, "grad_norm": 0.451171875, "grad_norm_var": 0.0009579976399739583, "learning_rate": 0.0001, "loss": 1.6299, "loss/crossentropy": 2.6284605264663696, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.17675121873617172, "step": 1512 }, { "epoch": 0.022592374140466928, "grad_norm": 0.5234375, "grad_norm_var": 0.0012293338775634765, "learning_rate": 0.0001, "loss": 1.8771, "loss/crossentropy": 2.4401475191116333, "loss/fcd": 1.6484375, "loss/idx": 7.0, "loss/logits": 0.22864650189876556, "step": 1513 }, { "epoch": 0.02260730631108191, "grad_norm": 0.48828125, "grad_norm_var": 0.001259613037109375, "learning_rate": 0.0001, "loss": 1.6196, "loss/crossentropy": 2.6447492837905884, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.18992742151021957, "step": 1514 }, { "epoch": 0.022622238481696894, "grad_norm": 0.392578125, "grad_norm_var": 0.0014540990193684896, "learning_rate": 0.0001, "loss": 1.5479, "loss/crossentropy": 2.427100419998169, "loss/fcd": 1.3828125, "loss/idx": 7.0, "loss/logits": 0.16507157683372498, "step": 1515 }, { "epoch": 0.022637170652311873, "grad_norm": 0.39453125, "grad_norm_var": 0.0016997655232747395, "learning_rate": 0.0001, "loss": 1.6024, "loss/crossentropy": 2.499715566635132, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.18047834187746048, "step": 1516 }, { "epoch": 0.022652102822926855, "grad_norm": 0.5078125, "grad_norm_var": 0.0018427371978759766, "learning_rate": 0.0001, "loss": 1.7189, "loss/crossentropy": 2.6804046630859375, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.2150314822793007, "step": 1517 }, { "epoch": 0.022667034993541835, "grad_norm": 0.458984375, "grad_norm_var": 0.0018182754516601562, "learning_rate": 0.0001, "loss": 1.6226, "loss/crossentropy": 2.502648949623108, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.18898165971040726, "step": 1518 }, { "epoch": 0.022681967164156817, "grad_norm": 0.49609375, "grad_norm_var": 0.0018737157185872396, "learning_rate": 0.0001, "loss": 1.619, "loss/crossentropy": 2.562047839164734, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.16982071101665497, "step": 1519 }, { "epoch": 0.0226968993347718, "grad_norm": 0.474609375, "grad_norm_var": 0.0018810113271077475, "learning_rate": 0.0001, "loss": 1.7214, "loss/crossentropy": 2.667098641395569, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.1979236751794815, "step": 1520 }, { "epoch": 0.02271183150538678, "grad_norm": 0.63671875, "grad_norm_var": 0.0037837823232014975, "learning_rate": 0.0001, "loss": 1.818, "loss/crossentropy": 3.0402382612228394, "loss/fcd": 1.59375, "loss/idx": 7.0, "loss/logits": 0.22424130141735077, "step": 1521 }, { "epoch": 0.022726763676001762, "grad_norm": 0.455078125, "grad_norm_var": 0.0034410953521728516, "learning_rate": 0.0001, "loss": 1.6237, "loss/crossentropy": 2.690005898475647, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.18618075549602509, "step": 1522 }, { "epoch": 0.022741695846616745, "grad_norm": 0.419921875, "grad_norm_var": 0.003578805923461914, "learning_rate": 0.0001, "loss": 1.5849, "loss/crossentropy": 2.6825876235961914, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.19041255861520767, "step": 1523 }, { "epoch": 0.022756628017231724, "grad_norm": 0.4453125, "grad_norm_var": 0.0035290877024332684, "learning_rate": 0.0001, "loss": 1.7102, "loss/crossentropy": 2.556373953819275, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.19456663727760315, "step": 1524 }, { "epoch": 0.022771560187846707, "grad_norm": 0.99609375, "grad_norm_var": 0.02054874102274577, "learning_rate": 0.0001, "loss": 1.6541, "loss/crossentropy": 2.5576895475387573, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.20096850395202637, "step": 1525 }, { "epoch": 0.02278649235846169, "grad_norm": 0.484375, "grad_norm_var": 0.02056857744852702, "learning_rate": 0.0001, "loss": 1.7278, "loss/crossentropy": 2.709898829460144, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.21212925761938095, "step": 1526 }, { "epoch": 0.02280142452907667, "grad_norm": 0.6953125, "grad_norm_var": 0.02242253621419271, "learning_rate": 0.0001, "loss": 1.7308, "loss/crossentropy": 2.7060130834579468, "loss/fcd": 1.51953125, "loss/idx": 7.0, "loss/logits": 0.2113034650683403, "step": 1527 }, { "epoch": 0.02281635669969165, "grad_norm": 0.515625, "grad_norm_var": 0.02209051450093587, "learning_rate": 0.0001, "loss": 1.7012, "loss/crossentropy": 2.7240923643112183, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.2090420499444008, "step": 1528 }, { "epoch": 0.02283128887030663, "grad_norm": 0.6015625, "grad_norm_var": 0.022465626398722332, "learning_rate": 0.0001, "loss": 1.6888, "loss/crossentropy": 2.8262499570846558, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.21227449923753738, "step": 1529 }, { "epoch": 0.022846221040921613, "grad_norm": 0.392578125, "grad_norm_var": 0.02355677286783854, "learning_rate": 0.0001, "loss": 1.4653, "loss/crossentropy": 2.6713192462921143, "loss/fcd": 1.31640625, "loss/idx": 7.0, "loss/logits": 0.1488954946398735, "step": 1530 }, { "epoch": 0.022861153211536596, "grad_norm": 0.447265625, "grad_norm_var": 0.022793070475260416, "learning_rate": 0.0001, "loss": 1.6075, "loss/crossentropy": 2.5342416763305664, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.1817667856812477, "step": 1531 }, { "epoch": 0.022876085382151575, "grad_norm": 0.48828125, "grad_norm_var": 0.021694437662760416, "learning_rate": 0.0001, "loss": 1.6303, "loss/crossentropy": 2.8033546209335327, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.20447959005832672, "step": 1532 }, { "epoch": 0.022891017552766558, "grad_norm": 0.5234375, "grad_norm_var": 0.021658833821614584, "learning_rate": 0.0001, "loss": 1.7876, "loss/crossentropy": 2.538213014602661, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.2211550772190094, "step": 1533 }, { "epoch": 0.02290594972338154, "grad_norm": 0.4453125, "grad_norm_var": 0.021805810928344726, "learning_rate": 0.0001, "loss": 1.7903, "loss/crossentropy": 2.6131246089935303, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.23564165830612183, "step": 1534 }, { "epoch": 0.02292088189399652, "grad_norm": 0.4375, "grad_norm_var": 0.02230362892150879, "learning_rate": 0.0001, "loss": 1.5504, "loss/crossentropy": 2.6005011796951294, "loss/fcd": 1.375, "loss/idx": 7.0, "loss/logits": 0.17541643232107162, "step": 1535 }, { "epoch": 0.022935814064611502, "grad_norm": 0.5078125, "grad_norm_var": 0.02213312784830729, "learning_rate": 0.0001, "loss": 1.8733, "loss/crossentropy": 2.4328452348709106, "loss/fcd": 1.63671875, "loss/idx": 7.0, "loss/logits": 0.23656418919563293, "step": 1536 }, { "epoch": 0.022950746235226485, "grad_norm": 0.4296875, "grad_norm_var": 0.02188714345296224, "learning_rate": 0.0001, "loss": 1.4982, "loss/crossentropy": 2.3456164598464966, "loss/fcd": 1.3359375, "loss/idx": 7.0, "loss/logits": 0.1623057723045349, "step": 1537 }, { "epoch": 0.022965678405841464, "grad_norm": 0.3984375, "grad_norm_var": 0.02256150245666504, "learning_rate": 0.0001, "loss": 1.6528, "loss/crossentropy": 2.3263497352600098, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.18014327436685562, "step": 1538 }, { "epoch": 0.022980610576456447, "grad_norm": 0.40234375, "grad_norm_var": 0.022801971435546874, "learning_rate": 0.0001, "loss": 1.6256, "loss/crossentropy": 2.741413116455078, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.18808145076036453, "step": 1539 }, { "epoch": 0.022995542747071426, "grad_norm": 0.4140625, "grad_norm_var": 0.02314580281575521, "learning_rate": 0.0001, "loss": 1.5412, "loss/crossentropy": 2.5992331504821777, "loss/fcd": 1.36328125, "loss/idx": 7.0, "loss/logits": 0.1778806746006012, "step": 1540 }, { "epoch": 0.02301047491768641, "grad_norm": 0.46875, "grad_norm_var": 0.0064345677693684895, "learning_rate": 0.0001, "loss": 1.6861, "loss/crossentropy": 2.645154595375061, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.2056177482008934, "step": 1541 }, { "epoch": 0.02302540708830139, "grad_norm": 0.50390625, "grad_norm_var": 0.00647430419921875, "learning_rate": 0.0001, "loss": 1.608, "loss/crossentropy": 3.0556046962738037, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.1861376166343689, "step": 1542 }, { "epoch": 0.02304033925891637, "grad_norm": 0.455078125, "grad_norm_var": 0.003168344497680664, "learning_rate": 0.0001, "loss": 1.8426, "loss/crossentropy": 2.5418970584869385, "loss/fcd": 1.59765625, "loss/idx": 7.0, "loss/logits": 0.2449493408203125, "step": 1543 }, { "epoch": 0.023055271429531354, "grad_norm": 0.427734375, "grad_norm_var": 0.0030517578125, "learning_rate": 0.0001, "loss": 1.6139, "loss/crossentropy": 2.835118532180786, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.19205392152071, "step": 1544 }, { "epoch": 0.023070203600146336, "grad_norm": 0.412109375, "grad_norm_var": 0.0016934553782145183, "learning_rate": 0.0001, "loss": 1.5049, "loss/crossentropy": 2.5481245517730713, "loss/fcd": 1.3515625, "loss/idx": 7.0, "loss/logits": 0.15329091250896454, "step": 1545 }, { "epoch": 0.023085135770761316, "grad_norm": 0.44921875, "grad_norm_var": 0.0014818827311197916, "learning_rate": 0.0001, "loss": 1.9153, "loss/crossentropy": 2.778249740600586, "loss/fcd": 1.65625, "loss/idx": 7.0, "loss/logits": 0.2590373530983925, "step": 1546 }, { "epoch": 0.023100067941376298, "grad_norm": 0.392578125, "grad_norm_var": 0.0016937255859375, "learning_rate": 0.0001, "loss": 1.5622, "loss/crossentropy": 2.767631411552429, "loss/fcd": 1.375, "loss/idx": 7.0, "loss/logits": 0.18715695291757584, "step": 1547 }, { "epoch": 0.02311500011199128, "grad_norm": 0.546875, "grad_norm_var": 0.0022287368774414062, "learning_rate": 0.0001, "loss": 1.8284, "loss/crossentropy": 2.524918556213379, "loss/fcd": 1.61328125, "loss/idx": 7.0, "loss/logits": 0.21513129025697708, "step": 1548 }, { "epoch": 0.02312993228260626, "grad_norm": 0.44140625, "grad_norm_var": 0.001856231689453125, "learning_rate": 0.0001, "loss": 1.6473, "loss/crossentropy": 2.6472318172454834, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.19806896150112152, "step": 1549 }, { "epoch": 0.023144864453221243, "grad_norm": 0.453125, "grad_norm_var": 0.0018595377604166666, "learning_rate": 0.0001, "loss": 1.565, "loss/crossentropy": 2.3890823125839233, "loss/fcd": 1.3984375, "loss/idx": 7.0, "loss/logits": 0.16657334566116333, "step": 1550 }, { "epoch": 0.023159796623836226, "grad_norm": 0.408203125, "grad_norm_var": 0.0019475142161051431, "learning_rate": 0.0001, "loss": 1.7483, "loss/crossentropy": 2.484150528907776, "loss/fcd": 1.53125, "loss/idx": 7.0, "loss/logits": 0.2170763462781906, "step": 1551 }, { "epoch": 0.023174728794451205, "grad_norm": 0.58203125, "grad_norm_var": 0.0029187361399332684, "learning_rate": 0.0001, "loss": 1.863, "loss/crossentropy": 2.5539822578430176, "loss/fcd": 1.625, "loss/idx": 7.0, "loss/logits": 0.23795197159051895, "step": 1552 }, { "epoch": 0.023189660965066188, "grad_norm": 0.392578125, "grad_norm_var": 0.0031008402506510417, "learning_rate": 0.0001, "loss": 1.4736, "loss/crossentropy": 2.5587610006332397, "loss/fcd": 1.31640625, "loss/idx": 7.0, "loss/logits": 0.15721678733825684, "step": 1553 }, { "epoch": 0.023204593135681167, "grad_norm": 0.48828125, "grad_norm_var": 0.003026262919108073, "learning_rate": 0.0001, "loss": 1.709, "loss/crossentropy": 2.83974826335907, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.19336502254009247, "step": 1554 }, { "epoch": 0.02321952530629615, "grad_norm": 0.46484375, "grad_norm_var": 0.0028533299763997396, "learning_rate": 0.0001, "loss": 1.6188, "loss/crossentropy": 2.6520248651504517, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.1890881061553955, "step": 1555 }, { "epoch": 0.023234457476911132, "grad_norm": 0.51171875, "grad_norm_var": 0.002899424235026042, "learning_rate": 0.0001, "loss": 1.7069, "loss/crossentropy": 2.49362576007843, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.20692860335111618, "step": 1556 }, { "epoch": 0.02324938964752611, "grad_norm": 0.4375, "grad_norm_var": 0.0029340108235677083, "learning_rate": 0.0001, "loss": 1.6047, "loss/crossentropy": 2.834295630455017, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.17499014735221863, "step": 1557 }, { "epoch": 0.023264321818141094, "grad_norm": 0.59765625, "grad_norm_var": 0.004026540120442708, "learning_rate": 0.0001, "loss": 1.8205, "loss/crossentropy": 2.5558085441589355, "loss/fcd": 1.609375, "loss/idx": 7.0, "loss/logits": 0.21112027019262314, "step": 1558 }, { "epoch": 0.023279253988756077, "grad_norm": 0.59375, "grad_norm_var": 0.005020761489868164, "learning_rate": 0.0001, "loss": 1.7496, "loss/crossentropy": 2.4718828201293945, "loss/fcd": 1.5390625, "loss/idx": 7.0, "loss/logits": 0.2105449065566063, "step": 1559 }, { "epoch": 0.023294186159371056, "grad_norm": 0.447265625, "grad_norm_var": 0.004921579360961914, "learning_rate": 0.0001, "loss": 1.5735, "loss/crossentropy": 2.622591733932495, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.17899316549301147, "step": 1560 }, { "epoch": 0.02330911832998604, "grad_norm": 0.49609375, "grad_norm_var": 0.004644775390625, "learning_rate": 0.0001, "loss": 1.6309, "loss/crossentropy": 2.7571672201156616, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.20515643060207367, "step": 1561 }, { "epoch": 0.02332405050060102, "grad_norm": 0.5625, "grad_norm_var": 0.004960060119628906, "learning_rate": 0.0001, "loss": 1.6672, "loss/crossentropy": 2.7672977447509766, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.19848168641328812, "step": 1562 }, { "epoch": 0.023338982671216, "grad_norm": 0.47265625, "grad_norm_var": 0.004336404800415039, "learning_rate": 0.0001, "loss": 1.6023, "loss/crossentropy": 2.674835681915283, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.17654051631689072, "step": 1563 }, { "epoch": 0.023353914841830983, "grad_norm": 0.45703125, "grad_norm_var": 0.004201873143513998, "learning_rate": 0.0001, "loss": 1.5478, "loss/crossentropy": 2.5982450246810913, "loss/fcd": 1.3828125, "loss/idx": 7.0, "loss/logits": 0.16496731340885162, "step": 1564 }, { "epoch": 0.023368847012445963, "grad_norm": 0.5, "grad_norm_var": 0.004053099950154623, "learning_rate": 0.0001, "loss": 1.7248, "loss/crossentropy": 2.3847527503967285, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.2014065459370613, "step": 1565 }, { "epoch": 0.023383779183060945, "grad_norm": 0.462890625, "grad_norm_var": 0.0040089925130208336, "learning_rate": 0.0001, "loss": 1.5863, "loss/crossentropy": 2.4995174407958984, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.164415180683136, "step": 1566 }, { "epoch": 0.023398711353675928, "grad_norm": 0.53125, "grad_norm_var": 0.003577407201131185, "learning_rate": 0.0001, "loss": 1.6788, "loss/crossentropy": 2.8225643634796143, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.20224997401237488, "step": 1567 }, { "epoch": 0.023413643524290907, "grad_norm": 0.482421875, "grad_norm_var": 0.0031064351399739585, "learning_rate": 0.0001, "loss": 1.6304, "loss/crossentropy": 2.5911120176315308, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.1968412771821022, "step": 1568 }, { "epoch": 0.02342857569490589, "grad_norm": 0.451171875, "grad_norm_var": 0.0025313695271809897, "learning_rate": 0.0001, "loss": 1.6428, "loss/crossentropy": 2.5418704748153687, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.1935887709259987, "step": 1569 }, { "epoch": 0.023443507865520873, "grad_norm": 0.5, "grad_norm_var": 0.0025258382161458333, "learning_rate": 0.0001, "loss": 1.6968, "loss/crossentropy": 2.611220955848694, "loss/fcd": 1.48828125, "loss/idx": 7.0, "loss/logits": 0.20849105715751648, "step": 1570 }, { "epoch": 0.023458440036135852, "grad_norm": 0.39453125, "grad_norm_var": 0.0031461079915364584, "learning_rate": 0.0001, "loss": 1.446, "loss/crossentropy": 2.6295387744903564, "loss/fcd": 1.28515625, "loss/idx": 7.0, "loss/logits": 0.16082587838172913, "step": 1571 }, { "epoch": 0.023473372206750834, "grad_norm": 0.451171875, "grad_norm_var": 0.003229379653930664, "learning_rate": 0.0001, "loss": 1.6323, "loss/crossentropy": 2.550940990447998, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.17917423695325851, "step": 1572 }, { "epoch": 0.023488304377365817, "grad_norm": 0.419921875, "grad_norm_var": 0.003371429443359375, "learning_rate": 0.0001, "loss": 1.6075, "loss/crossentropy": 2.503952383995056, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.1895056962966919, "step": 1573 }, { "epoch": 0.023503236547980796, "grad_norm": 0.6171875, "grad_norm_var": 0.0036788304646809896, "learning_rate": 0.0001, "loss": 1.623, "loss/crossentropy": 2.799260377883911, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.17770560830831528, "step": 1574 }, { "epoch": 0.02351816871859578, "grad_norm": 0.400390625, "grad_norm_var": 0.0033405145009358724, "learning_rate": 0.0001, "loss": 1.4874, "loss/crossentropy": 2.6628782749176025, "loss/fcd": 1.328125, "loss/idx": 7.0, "loss/logits": 0.15926603972911835, "step": 1575 }, { "epoch": 0.023533100889210762, "grad_norm": 0.5390625, "grad_norm_var": 0.0034921646118164064, "learning_rate": 0.0001, "loss": 1.7877, "loss/crossentropy": 2.4713010787963867, "loss/fcd": 1.5703125, "loss/idx": 7.0, "loss/logits": 0.21734385192394257, "step": 1576 }, { "epoch": 0.02354803305982574, "grad_norm": 0.404296875, "grad_norm_var": 0.003866434097290039, "learning_rate": 0.0001, "loss": 1.5796, "loss/crossentropy": 2.6206711530685425, "loss/fcd": 1.390625, "loss/idx": 7.0, "loss/logits": 0.18898864090442657, "step": 1577 }, { "epoch": 0.023562965230440724, "grad_norm": 0.490234375, "grad_norm_var": 0.0033777236938476564, "learning_rate": 0.0001, "loss": 1.6419, "loss/crossentropy": 2.5034207105636597, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.18877746164798737, "step": 1578 }, { "epoch": 0.023577897401055703, "grad_norm": 0.53515625, "grad_norm_var": 0.0036157608032226563, "learning_rate": 0.0001, "loss": 2.0868, "loss/crossentropy": 2.7469301223754883, "loss/fcd": 1.80859375, "loss/idx": 7.0, "loss/logits": 0.2782081812620163, "step": 1579 }, { "epoch": 0.023592829571670686, "grad_norm": 0.451171875, "grad_norm_var": 0.003633737564086914, "learning_rate": 0.0001, "loss": 1.6501, "loss/crossentropy": 2.4556583166122437, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.18522104620933533, "step": 1580 }, { "epoch": 0.02360776174228567, "grad_norm": 0.46875, "grad_norm_var": 0.003598642349243164, "learning_rate": 0.0001, "loss": 1.7168, "loss/crossentropy": 2.6500853300094604, "loss/fcd": 1.51953125, "loss/idx": 7.0, "loss/logits": 0.1973070204257965, "step": 1581 }, { "epoch": 0.023622693912900648, "grad_norm": 0.447265625, "grad_norm_var": 0.003639078140258789, "learning_rate": 0.0001, "loss": 1.7504, "loss/crossentropy": 2.4983482360839844, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.22697453200817108, "step": 1582 }, { "epoch": 0.02363762608351563, "grad_norm": 0.515625, "grad_norm_var": 0.0035350640614827474, "learning_rate": 0.0001, "loss": 1.5874, "loss/crossentropy": 2.6754297018051147, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.17720526456832886, "step": 1583 }, { "epoch": 0.023652558254130613, "grad_norm": 0.43359375, "grad_norm_var": 0.003622881571451823, "learning_rate": 0.0001, "loss": 1.6579, "loss/crossentropy": 2.661411762237549, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.20476175099611282, "step": 1584 }, { "epoch": 0.023667490424745592, "grad_norm": 0.458984375, "grad_norm_var": 0.0036071141560872397, "learning_rate": 0.0001, "loss": 1.5381, "loss/crossentropy": 2.626299500465393, "loss/fcd": 1.3671875, "loss/idx": 7.0, "loss/logits": 0.17094596475362778, "step": 1585 }, { "epoch": 0.023682422595360575, "grad_norm": 0.443359375, "grad_norm_var": 0.0035845279693603516, "learning_rate": 0.0001, "loss": 1.5842, "loss/crossentropy": 2.8080815076828003, "loss/fcd": 1.40234375, "loss/idx": 7.0, "loss/logits": 0.1818404197692871, "step": 1586 }, { "epoch": 0.023697354765975558, "grad_norm": 0.4296875, "grad_norm_var": 0.003322458267211914, "learning_rate": 0.0001, "loss": 1.5698, "loss/crossentropy": 2.596649646759033, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.17525531351566315, "step": 1587 }, { "epoch": 0.023712286936590537, "grad_norm": 0.55078125, "grad_norm_var": 0.003704261779785156, "learning_rate": 0.0001, "loss": 1.7572, "loss/crossentropy": 2.7423954010009766, "loss/fcd": 1.546875, "loss/idx": 7.0, "loss/logits": 0.21033258736133575, "step": 1588 }, { "epoch": 0.02372721910720552, "grad_norm": 0.4921875, "grad_norm_var": 0.003496662775675456, "learning_rate": 0.0001, "loss": 1.6647, "loss/crossentropy": 2.879364013671875, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.20377619564533234, "step": 1589 }, { "epoch": 0.0237421512778205, "grad_norm": 0.5078125, "grad_norm_var": 0.002241627375284831, "learning_rate": 0.0001, "loss": 1.8962, "loss/crossentropy": 2.434022903442383, "loss/fcd": 1.6484375, "loss/idx": 7.0, "loss/logits": 0.24780651926994324, "step": 1590 }, { "epoch": 0.02375708344843548, "grad_norm": 0.6796875, "grad_norm_var": 0.004412269592285157, "learning_rate": 0.0001, "loss": 1.8758, "loss/crossentropy": 2.5361167192459106, "loss/fcd": 1.66015625, "loss/idx": 7.0, "loss/logits": 0.21565672755241394, "step": 1591 }, { "epoch": 0.023772015619050464, "grad_norm": 0.41796875, "grad_norm_var": 0.004544321695963542, "learning_rate": 0.0001, "loss": 1.5075, "loss/crossentropy": 2.738878607749939, "loss/fcd": 1.33984375, "loss/idx": 7.0, "loss/logits": 0.16762951761484146, "step": 1592 }, { "epoch": 0.023786947789665443, "grad_norm": 0.486328125, "grad_norm_var": 0.0041050593058268225, "learning_rate": 0.0001, "loss": 1.7113, "loss/crossentropy": 2.573231339454651, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.20347873866558075, "step": 1593 }, { "epoch": 0.023801879960280426, "grad_norm": 0.44140625, "grad_norm_var": 0.004239765803019205, "learning_rate": 0.0001, "loss": 1.6489, "loss/crossentropy": 2.4706650972366333, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.1958089843392372, "step": 1594 }, { "epoch": 0.02381681213089541, "grad_norm": 0.453125, "grad_norm_var": 0.004111591974894206, "learning_rate": 0.0001, "loss": 1.6444, "loss/crossentropy": 2.6910659074783325, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.17950844019651413, "step": 1595 }, { "epoch": 0.023831744301510388, "grad_norm": 0.470703125, "grad_norm_var": 0.004060729344685873, "learning_rate": 0.0001, "loss": 1.6975, "loss/crossentropy": 2.5878102779388428, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.19749021530151367, "step": 1596 }, { "epoch": 0.02384667647212537, "grad_norm": 0.484375, "grad_norm_var": 0.004050302505493164, "learning_rate": 0.0001, "loss": 1.5805, "loss/crossentropy": 2.41681444644928, "loss/fcd": 1.40234375, "loss/idx": 7.0, "loss/logits": 0.17812514305114746, "step": 1597 }, { "epoch": 0.023861608642740353, "grad_norm": 0.4140625, "grad_norm_var": 0.004273223876953125, "learning_rate": 0.0001, "loss": 1.536, "loss/crossentropy": 2.4718927145004272, "loss/fcd": 1.37109375, "loss/idx": 7.0, "loss/logits": 0.1648888885974884, "step": 1598 }, { "epoch": 0.023876540813355333, "grad_norm": 0.63671875, "grad_norm_var": 0.005765215555826823, "learning_rate": 0.0001, "loss": 1.9009, "loss/crossentropy": 2.596011519432068, "loss/fcd": 1.66015625, "loss/idx": 7.0, "loss/logits": 0.24071332067251205, "step": 1599 }, { "epoch": 0.023891472983970315, "grad_norm": 0.474609375, "grad_norm_var": 0.005575291315714518, "learning_rate": 0.0001, "loss": 1.617, "loss/crossentropy": 2.4079898595809937, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.18339631706476212, "step": 1600 }, { "epoch": 0.023906405154585295, "grad_norm": 0.44921875, "grad_norm_var": 0.005621782938639323, "learning_rate": 0.0001, "loss": 1.6684, "loss/crossentropy": 2.572506904602051, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.19576621800661087, "step": 1601 }, { "epoch": 0.023921337325200277, "grad_norm": 0.484375, "grad_norm_var": 0.005474583307902018, "learning_rate": 0.0001, "loss": 1.9182, "loss/crossentropy": 2.5226889848709106, "loss/fcd": 1.671875, "loss/idx": 7.0, "loss/logits": 0.24628648161888123, "step": 1602 }, { "epoch": 0.02393626949581526, "grad_norm": 0.3984375, "grad_norm_var": 0.005795526504516602, "learning_rate": 0.0001, "loss": 1.6642, "loss/crossentropy": 2.602216124534607, "loss/fcd": 1.45703125, "loss/idx": 7.0, "loss/logits": 0.20719221979379654, "step": 1603 }, { "epoch": 0.02395120166643024, "grad_norm": 0.50390625, "grad_norm_var": 0.005553674697875976, "learning_rate": 0.0001, "loss": 1.6675, "loss/crossentropy": 2.4609646797180176, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.1987275555729866, "step": 1604 }, { "epoch": 0.023966133837045222, "grad_norm": 0.5546875, "grad_norm_var": 0.00583952267964681, "learning_rate": 0.0001, "loss": 1.687, "loss/crossentropy": 2.5550639629364014, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.1792164072394371, "step": 1605 }, { "epoch": 0.023981066007660205, "grad_norm": 0.40625, "grad_norm_var": 0.00625774065653483, "learning_rate": 0.0001, "loss": 1.3797, "loss/crossentropy": 2.5962554216384888, "loss/fcd": 1.2421875, "loss/idx": 7.0, "loss/logits": 0.13753189146518707, "step": 1606 }, { "epoch": 0.023995998178275184, "grad_norm": 0.458984375, "grad_norm_var": 0.003565406799316406, "learning_rate": 0.0001, "loss": 1.67, "loss/crossentropy": 2.744762420654297, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.19737379252910614, "step": 1607 }, { "epoch": 0.024010930348890167, "grad_norm": 0.380859375, "grad_norm_var": 0.003913609186808268, "learning_rate": 0.0001, "loss": 1.5864, "loss/crossentropy": 2.4816367626190186, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.17619794607162476, "step": 1608 }, { "epoch": 0.02402586251950515, "grad_norm": 0.45703125, "grad_norm_var": 0.0038981119791666667, "learning_rate": 0.0001, "loss": 1.679, "loss/crossentropy": 2.2741174697875977, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.19848152250051498, "step": 1609 }, { "epoch": 0.02404079469012013, "grad_norm": 0.376953125, "grad_norm_var": 0.004375950495402018, "learning_rate": 0.0001, "loss": 1.5897, "loss/crossentropy": 2.5791796445846558, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.19519494473934174, "step": 1610 }, { "epoch": 0.02405572686073511, "grad_norm": 0.44140625, "grad_norm_var": 0.004399601618448893, "learning_rate": 0.0001, "loss": 1.6222, "loss/crossentropy": 2.7612199783325195, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.1729477047920227, "step": 1611 }, { "epoch": 0.024070659031350094, "grad_norm": 0.416015625, "grad_norm_var": 0.004523324966430664, "learning_rate": 0.0001, "loss": 1.6367, "loss/crossentropy": 2.3753793239593506, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.18361905962228775, "step": 1612 }, { "epoch": 0.024085591201965073, "grad_norm": 0.46484375, "grad_norm_var": 0.00448009173075358, "learning_rate": 0.0001, "loss": 1.5807, "loss/crossentropy": 2.741746187210083, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.17443695664405823, "step": 1613 }, { "epoch": 0.024100523372580056, "grad_norm": 0.474609375, "grad_norm_var": 0.0043593724568684895, "learning_rate": 0.0001, "loss": 1.7266, "loss/crossentropy": 2.6350860595703125, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.21098782122135162, "step": 1614 }, { "epoch": 0.024115455543195035, "grad_norm": 0.490234375, "grad_norm_var": 0.0022720177968343098, "learning_rate": 0.0001, "loss": 1.6602, "loss/crossentropy": 2.6208122968673706, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.1992257758975029, "step": 1615 }, { "epoch": 0.024130387713810018, "grad_norm": 0.51171875, "grad_norm_var": 0.0024698257446289064, "learning_rate": 0.0001, "loss": 1.7546, "loss/crossentropy": 2.395472764968872, "loss/fcd": 1.5390625, "loss/idx": 7.0, "loss/logits": 0.21557673066854477, "step": 1616 }, { "epoch": 0.024145319884425, "grad_norm": 0.392578125, "grad_norm_var": 0.002709054946899414, "learning_rate": 0.0001, "loss": 1.6455, "loss/crossentropy": 2.7418118715286255, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.19631459563970566, "step": 1617 }, { "epoch": 0.02416025205503998, "grad_norm": 0.453125, "grad_norm_var": 0.002630217870076497, "learning_rate": 0.0001, "loss": 1.7756, "loss/crossentropy": 2.547677159309387, "loss/fcd": 1.55859375, "loss/idx": 7.0, "loss/logits": 0.21696195006370544, "step": 1618 }, { "epoch": 0.024175184225654962, "grad_norm": 0.455078125, "grad_norm_var": 0.0024499893188476562, "learning_rate": 0.0001, "loss": 1.7295, "loss/crossentropy": 2.486993432044983, "loss/fcd": 1.52734375, "loss/idx": 7.0, "loss/logits": 0.20217935740947723, "step": 1619 }, { "epoch": 0.024190116396269945, "grad_norm": 0.447265625, "grad_norm_var": 0.0022614638010660807, "learning_rate": 0.0001, "loss": 1.6641, "loss/crossentropy": 2.610501766204834, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.1914554387331009, "step": 1620 }, { "epoch": 0.024205048566884924, "grad_norm": 0.79296875, "grad_norm_var": 0.009172550837198893, "learning_rate": 0.0001, "loss": 1.8343, "loss/crossentropy": 2.6146063804626465, "loss/fcd": 1.625, "loss/idx": 7.0, "loss/logits": 0.209325909614563, "step": 1621 }, { "epoch": 0.024219980737499907, "grad_norm": 0.46875, "grad_norm_var": 0.008937565485636394, "learning_rate": 0.0001, "loss": 1.8125, "loss/crossentropy": 2.703667640686035, "loss/fcd": 1.59375, "loss/idx": 7.0, "loss/logits": 0.2187758833169937, "step": 1622 }, { "epoch": 0.02423491290811489, "grad_norm": 0.458984375, "grad_norm_var": 0.008937565485636394, "learning_rate": 0.0001, "loss": 1.7127, "loss/crossentropy": 2.48435115814209, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.2049325406551361, "step": 1623 }, { "epoch": 0.02424984507872987, "grad_norm": 0.427734375, "grad_norm_var": 0.008532444636027018, "learning_rate": 0.0001, "loss": 1.4806, "loss/crossentropy": 2.588418126106262, "loss/fcd": 1.31640625, "loss/idx": 7.0, "loss/logits": 0.1641799509525299, "step": 1624 }, { "epoch": 0.02426477724934485, "grad_norm": 0.5, "grad_norm_var": 0.008570210138956705, "learning_rate": 0.0001, "loss": 1.5556, "loss/crossentropy": 2.860697865486145, "loss/fcd": 1.3828125, "loss/idx": 7.0, "loss/logits": 0.17278952151536942, "step": 1625 }, { "epoch": 0.02427970941995983, "grad_norm": 0.51171875, "grad_norm_var": 0.007974688212076824, "learning_rate": 0.0001, "loss": 1.776, "loss/crossentropy": 2.5550949573516846, "loss/fcd": 1.55078125, "loss/idx": 7.0, "loss/logits": 0.22521134465932846, "step": 1626 }, { "epoch": 0.024294641590574814, "grad_norm": 0.87109375, "grad_norm_var": 0.017206255594889322, "learning_rate": 0.0001, "loss": 1.8529, "loss/crossentropy": 2.4149361848831177, "loss/fcd": 1.65625, "loss/idx": 7.0, "loss/logits": 0.1966421902179718, "step": 1627 }, { "epoch": 0.024309573761189796, "grad_norm": 0.61328125, "grad_norm_var": 0.017204650243123374, "learning_rate": 0.0001, "loss": 1.8448, "loss/crossentropy": 2.2909621596336365, "loss/fcd": 1.62109375, "loss/idx": 7.0, "loss/logits": 0.22369665652513504, "step": 1628 }, { "epoch": 0.024324505931804775, "grad_norm": 0.447265625, "grad_norm_var": 0.01735528310139974, "learning_rate": 0.0001, "loss": 1.5968, "loss/crossentropy": 2.7739763259887695, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.18666711449623108, "step": 1629 }, { "epoch": 0.024339438102419758, "grad_norm": 0.5, "grad_norm_var": 0.0172426700592041, "learning_rate": 0.0001, "loss": 1.7953, "loss/crossentropy": 2.588120698928833, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.2289050817489624, "step": 1630 }, { "epoch": 0.02435437027303474, "grad_norm": 0.408203125, "grad_norm_var": 0.01800370216369629, "learning_rate": 0.0001, "loss": 1.5356, "loss/crossentropy": 2.6186710596084595, "loss/fcd": 1.359375, "loss/idx": 7.0, "loss/logits": 0.17625487595796585, "step": 1631 }, { "epoch": 0.02436930244364972, "grad_norm": 0.8359375, "grad_norm_var": 0.024378315607706705, "learning_rate": 0.0001, "loss": 1.6888, "loss/crossentropy": 2.7367137670516968, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.20829469710588455, "step": 1632 }, { "epoch": 0.024384234614264703, "grad_norm": 0.447265625, "grad_norm_var": 0.02351581255594889, "learning_rate": 0.0001, "loss": 1.748, "loss/crossentropy": 2.6904454231262207, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.2323940247297287, "step": 1633 }, { "epoch": 0.024399166784879685, "grad_norm": 0.71875, "grad_norm_var": 0.02485171953837077, "learning_rate": 0.0001, "loss": 1.8111, "loss/crossentropy": 2.787333369255066, "loss/fcd": 1.5859375, "loss/idx": 7.0, "loss/logits": 0.22514434158802032, "step": 1634 }, { "epoch": 0.024414098955494665, "grad_norm": 0.4453125, "grad_norm_var": 0.024989763895670574, "learning_rate": 0.0001, "loss": 1.5703, "loss/crossentropy": 2.5433605909347534, "loss/fcd": 1.40234375, "loss/idx": 7.0, "loss/logits": 0.1679105907678604, "step": 1635 }, { "epoch": 0.024429031126109647, "grad_norm": 0.41796875, "grad_norm_var": 0.025467793146769207, "learning_rate": 0.0001, "loss": 1.5799, "loss/crossentropy": 2.6077455282211304, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.1736254319548607, "step": 1636 }, { "epoch": 0.02444396329672463, "grad_norm": 0.48046875, "grad_norm_var": 0.02161749203999837, "learning_rate": 0.0001, "loss": 1.7188, "loss/crossentropy": 2.7809062004089355, "loss/fcd": 1.51953125, "loss/idx": 7.0, "loss/logits": 0.1992294043302536, "step": 1637 }, { "epoch": 0.02445889546733961, "grad_norm": 0.45703125, "grad_norm_var": 0.021728881200154624, "learning_rate": 0.0001, "loss": 1.622, "loss/crossentropy": 2.7618008852005005, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.20014109462499619, "step": 1638 }, { "epoch": 0.024473827637954592, "grad_norm": 0.455078125, "grad_norm_var": 0.021768808364868164, "learning_rate": 0.0001, "loss": 1.6655, "loss/crossentropy": 2.623804807662964, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.1967843845486641, "step": 1639 }, { "epoch": 0.02448875980856957, "grad_norm": 0.462890625, "grad_norm_var": 0.02134995460510254, "learning_rate": 0.0001, "loss": 1.6911, "loss/crossentropy": 2.4398980140686035, "loss/fcd": 1.48828125, "loss/idx": 7.0, "loss/logits": 0.20278441905975342, "step": 1640 }, { "epoch": 0.024503691979184554, "grad_norm": 0.4453125, "grad_norm_var": 0.021797672907511393, "learning_rate": 0.0001, "loss": 1.742, "loss/crossentropy": 2.558762311935425, "loss/fcd": 1.53125, "loss/idx": 7.0, "loss/logits": 0.21071960031986237, "step": 1641 }, { "epoch": 0.024518624149799537, "grad_norm": 0.466796875, "grad_norm_var": 0.02204736073811849, "learning_rate": 0.0001, "loss": 1.6663, "loss/crossentropy": 2.6475770473480225, "loss/fcd": 1.48828125, "loss/idx": 7.0, "loss/logits": 0.17798765748739243, "step": 1642 }, { "epoch": 0.024533556320414516, "grad_norm": 0.40234375, "grad_norm_var": 0.01443322499593099, "learning_rate": 0.0001, "loss": 1.4809, "loss/crossentropy": 2.598748207092285, "loss/fcd": 1.30859375, "loss/idx": 7.0, "loss/logits": 0.172304205596447, "step": 1643 }, { "epoch": 0.0245484884910295, "grad_norm": 0.458984375, "grad_norm_var": 0.01359569231669108, "learning_rate": 0.0001, "loss": 1.5578, "loss/crossentropy": 2.6750913858413696, "loss/fcd": 1.390625, "loss/idx": 7.0, "loss/logits": 0.1672048419713974, "step": 1644 }, { "epoch": 0.02456342066164448, "grad_norm": 0.61328125, "grad_norm_var": 0.014359029134114583, "learning_rate": 0.0001, "loss": 2.1439, "loss/crossentropy": 2.3801056146621704, "loss/fcd": 1.8671875, "loss/idx": 7.0, "loss/logits": 0.27668944001197815, "step": 1645 }, { "epoch": 0.02457835283225946, "grad_norm": 0.48046875, "grad_norm_var": 0.014385414123535157, "learning_rate": 0.0001, "loss": 1.5733, "loss/crossentropy": 2.666161894798279, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.17872051894664764, "step": 1646 }, { "epoch": 0.024593285002874443, "grad_norm": 0.40625, "grad_norm_var": 0.014409494400024415, "learning_rate": 0.0001, "loss": 1.6064, "loss/crossentropy": 2.5731054544448853, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.17671120166778564, "step": 1647 }, { "epoch": 0.024608217173489426, "grad_norm": 0.4375, "grad_norm_var": 0.006465387344360351, "learning_rate": 0.0001, "loss": 1.5815, "loss/crossentropy": 2.5400450229644775, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.1869390457868576, "step": 1648 }, { "epoch": 0.024623149344104405, "grad_norm": 0.4296875, "grad_norm_var": 0.006549072265625, "learning_rate": 0.0001, "loss": 1.6781, "loss/crossentropy": 2.5286325216293335, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.21328392624855042, "step": 1649 }, { "epoch": 0.024638081514719388, "grad_norm": 0.43359375, "grad_norm_var": 0.0023116429646809896, "learning_rate": 0.0001, "loss": 1.6307, "loss/crossentropy": 2.507314920425415, "loss/fcd": 1.44140625, "loss/idx": 7.0, "loss/logits": 0.1893395036458969, "step": 1650 }, { "epoch": 0.024653013685334367, "grad_norm": 0.419921875, "grad_norm_var": 0.0023874759674072264, "learning_rate": 0.0001, "loss": 1.6497, "loss/crossentropy": 2.6344709396362305, "loss/fcd": 1.44140625, "loss/idx": 7.0, "loss/logits": 0.20828261971473694, "step": 1651 }, { "epoch": 0.02466794585594935, "grad_norm": 0.390625, "grad_norm_var": 0.0025663852691650392, "learning_rate": 0.0001, "loss": 1.6056, "loss/crossentropy": 2.3970154523849487, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.1837209090590477, "step": 1652 }, { "epoch": 0.024682878026564332, "grad_norm": 0.51171875, "grad_norm_var": 0.0027438958485921225, "learning_rate": 0.0001, "loss": 1.7946, "loss/crossentropy": 2.633846640586853, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.23988796770572662, "step": 1653 }, { "epoch": 0.02469781019717931, "grad_norm": 0.91015625, "grad_norm_var": 0.015731414159138996, "learning_rate": 0.0001, "loss": 1.7568, "loss/crossentropy": 2.801383137702942, "loss/fcd": 1.53515625, "loss/idx": 7.0, "loss/logits": 0.22162751853466034, "step": 1654 }, { "epoch": 0.024712742367794294, "grad_norm": 0.44921875, "grad_norm_var": 0.015755208333333333, "learning_rate": 0.0001, "loss": 1.5121, "loss/crossentropy": 2.677790641784668, "loss/fcd": 1.3515625, "loss/idx": 7.0, "loss/logits": 0.16054877638816833, "step": 1655 }, { "epoch": 0.024727674538409277, "grad_norm": 0.498046875, "grad_norm_var": 0.01574090321858724, "learning_rate": 0.0001, "loss": 1.5935, "loss/crossentropy": 2.7032641172409058, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.17940928041934967, "step": 1656 }, { "epoch": 0.024742606709024256, "grad_norm": 0.515625, "grad_norm_var": 0.01568139394124349, "learning_rate": 0.0001, "loss": 1.7322, "loss/crossentropy": 2.6352341175079346, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.22824689745903015, "step": 1657 }, { "epoch": 0.02475753887963924, "grad_norm": 0.51953125, "grad_norm_var": 0.015698989232381184, "learning_rate": 0.0001, "loss": 1.6144, "loss/crossentropy": 2.6113274097442627, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.1807587519288063, "step": 1658 }, { "epoch": 0.02477247105025422, "grad_norm": 0.53125, "grad_norm_var": 0.015191253026326497, "learning_rate": 0.0001, "loss": 1.9066, "loss/crossentropy": 2.7532520294189453, "loss/fcd": 1.64453125, "loss/idx": 7.0, "loss/logits": 0.26209019124507904, "step": 1659 }, { "epoch": 0.0247874032208692, "grad_norm": 0.703125, "grad_norm_var": 0.017569478352864584, "learning_rate": 0.0001, "loss": 1.9952, "loss/crossentropy": 2.48457670211792, "loss/fcd": 1.75390625, "loss/idx": 7.0, "loss/logits": 0.2413245141506195, "step": 1660 }, { "epoch": 0.024802335391484184, "grad_norm": 0.3984375, "grad_norm_var": 0.017656898498535155, "learning_rate": 0.0001, "loss": 1.378, "loss/crossentropy": 2.5506786108016968, "loss/fcd": 1.2421875, "loss/idx": 7.0, "loss/logits": 0.13581915944814682, "step": 1661 }, { "epoch": 0.024817267562099163, "grad_norm": 0.443359375, "grad_norm_var": 0.017850478490193684, "learning_rate": 0.0001, "loss": 1.6875, "loss/crossentropy": 2.6211588382720947, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.20701348036527634, "step": 1662 }, { "epoch": 0.024832199732714146, "grad_norm": 0.43359375, "grad_norm_var": 0.017555856704711915, "learning_rate": 0.0001, "loss": 1.6533, "loss/crossentropy": 2.3935290575027466, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.18453489243984222, "step": 1663 }, { "epoch": 0.024847131903329128, "grad_norm": 0.447265625, "grad_norm_var": 0.017478370666503908, "learning_rate": 0.0001, "loss": 1.634, "loss/crossentropy": 2.560220956802368, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.1809101179242134, "step": 1664 }, { "epoch": 0.024862064073944107, "grad_norm": 0.6015625, "grad_norm_var": 0.017663002014160156, "learning_rate": 0.0001, "loss": 1.8313, "loss/crossentropy": 2.6589072942733765, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.2649317979812622, "step": 1665 }, { "epoch": 0.02487699624455909, "grad_norm": 0.458984375, "grad_norm_var": 0.01743467648824056, "learning_rate": 0.0001, "loss": 1.6906, "loss/crossentropy": 2.545647144317627, "loss/fcd": 1.484375, "loss/idx": 7.0, "loss/logits": 0.20625202357769012, "step": 1666 }, { "epoch": 0.024891928415174073, "grad_norm": 0.416015625, "grad_norm_var": 0.01748490333557129, "learning_rate": 0.0001, "loss": 1.5305, "loss/crossentropy": 2.803003668785095, "loss/fcd": 1.3515625, "loss/idx": 7.0, "loss/logits": 0.17893870919942856, "step": 1667 }, { "epoch": 0.024906860585789052, "grad_norm": 0.5390625, "grad_norm_var": 0.016414626439412435, "learning_rate": 0.0001, "loss": 1.6153, "loss/crossentropy": 2.668912410736084, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.16222340613603592, "step": 1668 }, { "epoch": 0.024921792756404035, "grad_norm": 0.404296875, "grad_norm_var": 0.017305437723795572, "learning_rate": 0.0001, "loss": 1.5889, "loss/crossentropy": 2.715358018875122, "loss/fcd": 1.390625, "loss/idx": 7.0, "loss/logits": 0.19830583781003952, "step": 1669 }, { "epoch": 0.024936724927019017, "grad_norm": 0.451171875, "grad_norm_var": 0.006402317682902018, "learning_rate": 0.0001, "loss": 1.5415, "loss/crossentropy": 2.7629430294036865, "loss/fcd": 1.375, "loss/idx": 7.0, "loss/logits": 0.16647836565971375, "step": 1670 }, { "epoch": 0.024951657097633997, "grad_norm": 0.408203125, "grad_norm_var": 0.006720415751139323, "learning_rate": 0.0001, "loss": 1.4606, "loss/crossentropy": 2.637349843978882, "loss/fcd": 1.3046875, "loss/idx": 7.0, "loss/logits": 0.15592695027589798, "step": 1671 }, { "epoch": 0.02496658926824898, "grad_norm": 0.50390625, "grad_norm_var": 0.006732288996378581, "learning_rate": 0.0001, "loss": 1.6183, "loss/crossentropy": 2.631614923477173, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.18078526109457016, "step": 1672 }, { "epoch": 0.024981521438863962, "grad_norm": 0.48828125, "grad_norm_var": 0.0066708723704020185, "learning_rate": 0.0001, "loss": 1.6249, "loss/crossentropy": 2.4787803888320923, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.17176026105880737, "step": 1673 }, { "epoch": 0.02499645360947894, "grad_norm": 0.458984375, "grad_norm_var": 0.006615193684895834, "learning_rate": 0.0001, "loss": 1.6468, "loss/crossentropy": 2.4326345920562744, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.1936572641134262, "step": 1674 }, { "epoch": 0.025011385780093924, "grad_norm": 0.482421875, "grad_norm_var": 0.006433598200480143, "learning_rate": 0.0001, "loss": 1.646, "loss/crossentropy": 2.4791451692581177, "loss/fcd": 1.45703125, "loss/idx": 7.0, "loss/logits": 0.1889389008283615, "step": 1675 }, { "epoch": 0.025026317950708903, "grad_norm": 0.74609375, "grad_norm_var": 0.007842111587524413, "learning_rate": 0.0001, "loss": 1.9245, "loss/crossentropy": 2.682641386985779, "loss/fcd": 1.68359375, "loss/idx": 7.0, "loss/logits": 0.24086102843284607, "step": 1676 }, { "epoch": 0.025041250121323886, "grad_norm": 0.42578125, "grad_norm_var": 0.007591104507446289, "learning_rate": 0.0001, "loss": 1.6518, "loss/crossentropy": 2.6392557621002197, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.20261266082525253, "step": 1677 }, { "epoch": 0.02505618229193887, "grad_norm": 0.416015625, "grad_norm_var": 0.007778024673461914, "learning_rate": 0.0001, "loss": 1.6168, "loss/crossentropy": 2.868458390235901, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.1949130743741989, "step": 1678 }, { "epoch": 0.025071114462553848, "grad_norm": 0.55078125, "grad_norm_var": 0.007909631729125977, "learning_rate": 0.0001, "loss": 1.6456, "loss/crossentropy": 2.4708521366119385, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.17297638207674026, "step": 1679 }, { "epoch": 0.02508604663316883, "grad_norm": 0.5078125, "grad_norm_var": 0.007814534505208333, "learning_rate": 0.0001, "loss": 1.7805, "loss/crossentropy": 2.543753504753113, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.214086152613163, "step": 1680 }, { "epoch": 0.025100978803783813, "grad_norm": 0.51171875, "grad_norm_var": 0.006997108459472656, "learning_rate": 0.0001, "loss": 1.625, "loss/crossentropy": 2.4634329080581665, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.20312698930501938, "step": 1681 }, { "epoch": 0.025115910974398793, "grad_norm": 0.609375, "grad_norm_var": 0.007877079645792644, "learning_rate": 0.0001, "loss": 1.8753, "loss/crossentropy": 2.5432770252227783, "loss/fcd": 1.65234375, "loss/idx": 7.0, "loss/logits": 0.22290775179862976, "step": 1682 }, { "epoch": 0.025130843145013775, "grad_norm": 0.451171875, "grad_norm_var": 0.007584110895792643, "learning_rate": 0.0001, "loss": 1.6209, "loss/crossentropy": 2.307678699493408, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.18343796581029892, "step": 1683 }, { "epoch": 0.025145775315628758, "grad_norm": 0.50390625, "grad_norm_var": 0.007465092341105143, "learning_rate": 0.0001, "loss": 1.8394, "loss/crossentropy": 2.988747477531433, "loss/fcd": 1.61328125, "loss/idx": 7.0, "loss/logits": 0.22612474113702774, "step": 1684 }, { "epoch": 0.025160707486243737, "grad_norm": 0.38671875, "grad_norm_var": 0.007696978251139323, "learning_rate": 0.0001, "loss": 1.5266, "loss/crossentropy": 2.45614755153656, "loss/fcd": 1.359375, "loss/idx": 7.0, "loss/logits": 0.1672608107328415, "step": 1685 }, { "epoch": 0.02517563965685872, "grad_norm": 0.458984375, "grad_norm_var": 0.007656288146972656, "learning_rate": 0.0001, "loss": 1.884, "loss/crossentropy": 2.2561362981796265, "loss/fcd": 1.625, "loss/idx": 7.0, "loss/logits": 0.25896310061216354, "step": 1686 }, { "epoch": 0.0251905718274737, "grad_norm": 0.45703125, "grad_norm_var": 0.007244221369425456, "learning_rate": 0.0001, "loss": 1.5452, "loss/crossentropy": 2.6483774185180664, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.16627098619937897, "step": 1687 }, { "epoch": 0.025205503998088682, "grad_norm": 0.5, "grad_norm_var": 0.00724180539449056, "learning_rate": 0.0001, "loss": 1.8459, "loss/crossentropy": 2.4387799501419067, "loss/fcd": 1.5859375, "loss/idx": 7.0, "loss/logits": 0.2599783390760422, "step": 1688 }, { "epoch": 0.025220436168703664, "grad_norm": 0.55078125, "grad_norm_var": 0.007411686579386393, "learning_rate": 0.0001, "loss": 1.9159, "loss/crossentropy": 2.43993878364563, "loss/fcd": 1.65625, "loss/idx": 7.0, "loss/logits": 0.2596488744020462, "step": 1689 }, { "epoch": 0.025235368339318644, "grad_norm": 0.396484375, "grad_norm_var": 0.008006779352823894, "learning_rate": 0.0001, "loss": 1.5128, "loss/crossentropy": 2.69704270362854, "loss/fcd": 1.34375, "loss/idx": 7.0, "loss/logits": 0.16903749108314514, "step": 1690 }, { "epoch": 0.025250300509933626, "grad_norm": 0.546875, "grad_norm_var": 0.008139483133951823, "learning_rate": 0.0001, "loss": 1.6848, "loss/crossentropy": 2.6188812255859375, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.18483464419841766, "step": 1691 }, { "epoch": 0.02526523268054861, "grad_norm": 0.412109375, "grad_norm_var": 0.0042065779368082685, "learning_rate": 0.0001, "loss": 1.6088, "loss/crossentropy": 2.7869977951049805, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.19087275862693787, "step": 1692 }, { "epoch": 0.02528016485116359, "grad_norm": 0.6015625, "grad_norm_var": 0.0048588911692301435, "learning_rate": 0.0001, "loss": 1.6886, "loss/crossentropy": 2.6109875440597534, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.2120508998632431, "step": 1693 }, { "epoch": 0.02529509702177857, "grad_norm": 0.51171875, "grad_norm_var": 0.004470252990722656, "learning_rate": 0.0001, "loss": 1.5881, "loss/crossentropy": 2.454450845718384, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.16236257553100586, "step": 1694 }, { "epoch": 0.025310029192393554, "grad_norm": 0.41796875, "grad_norm_var": 0.004625892639160157, "learning_rate": 0.0001, "loss": 1.423, "loss/crossentropy": 2.6954513788223267, "loss/fcd": 1.26953125, "loss/idx": 7.0, "loss/logits": 0.15342209488153458, "step": 1695 }, { "epoch": 0.025324961363008533, "grad_norm": 0.466796875, "grad_norm_var": 0.004628229141235352, "learning_rate": 0.0001, "loss": 1.5733, "loss/crossentropy": 2.597328305244446, "loss/fcd": 1.40234375, "loss/idx": 7.0, "loss/logits": 0.17098169773817062, "step": 1696 }, { "epoch": 0.025339893533623516, "grad_norm": 0.47265625, "grad_norm_var": 0.004591989517211914, "learning_rate": 0.0001, "loss": 1.6754, "loss/crossentropy": 2.6136374473571777, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.1988101452589035, "step": 1697 }, { "epoch": 0.0253548257042385, "grad_norm": 0.47265625, "grad_norm_var": 0.0034749190012613933, "learning_rate": 0.0001, "loss": 1.6458, "loss/crossentropy": 2.4211668968200684, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.192709781229496, "step": 1698 }, { "epoch": 0.025369757874853478, "grad_norm": 0.48828125, "grad_norm_var": 0.0034407933553059896, "learning_rate": 0.0001, "loss": 1.8869, "loss/crossentropy": 2.555441379547119, "loss/fcd": 1.64453125, "loss/idx": 7.0, "loss/logits": 0.24241416156291962, "step": 1699 }, { "epoch": 0.02538469004546846, "grad_norm": 0.453125, "grad_norm_var": 0.0034250895182291667, "learning_rate": 0.0001, "loss": 1.6706, "loss/crossentropy": 2.854443907737732, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.2057504653930664, "step": 1700 }, { "epoch": 0.02539962221608344, "grad_norm": 0.63671875, "grad_norm_var": 0.004401652018229166, "learning_rate": 0.0001, "loss": 1.8133, "loss/crossentropy": 2.7695144414901733, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.2468656525015831, "step": 1701 }, { "epoch": 0.025414554386698422, "grad_norm": 0.42578125, "grad_norm_var": 0.004608901341756185, "learning_rate": 0.0001, "loss": 1.6345, "loss/crossentropy": 2.57673442363739, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.18916824460029602, "step": 1702 }, { "epoch": 0.025429486557313405, "grad_norm": 0.439453125, "grad_norm_var": 0.00470116933186849, "learning_rate": 0.0001, "loss": 1.6128, "loss/crossentropy": 2.578126311302185, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.1870439350605011, "step": 1703 }, { "epoch": 0.025444418727928384, "grad_norm": 0.51953125, "grad_norm_var": 0.004758707682291667, "learning_rate": 0.0001, "loss": 1.6175, "loss/crossentropy": 2.347869634628296, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.164364293217659, "step": 1704 }, { "epoch": 0.025459350898543367, "grad_norm": 0.455078125, "grad_norm_var": 0.004533624649047852, "learning_rate": 0.0001, "loss": 1.733, "loss/crossentropy": 2.692792534828186, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.22913546860218048, "step": 1705 }, { "epoch": 0.02547428306915835, "grad_norm": 0.6875, "grad_norm_var": 0.006496938069661459, "learning_rate": 0.0001, "loss": 1.8966, "loss/crossentropy": 2.581258535385132, "loss/fcd": 1.671875, "loss/idx": 7.0, "loss/logits": 0.22475934028625488, "step": 1706 }, { "epoch": 0.02548921523977333, "grad_norm": 0.41796875, "grad_norm_var": 0.00673821767171224, "learning_rate": 0.0001, "loss": 1.555, "loss/crossentropy": 2.708857297897339, "loss/fcd": 1.37109375, "loss/idx": 7.0, "loss/logits": 0.18387839198112488, "step": 1707 }, { "epoch": 0.02550414741038831, "grad_norm": 0.46875, "grad_norm_var": 0.006332127253214518, "learning_rate": 0.0001, "loss": 1.5831, "loss/crossentropy": 2.8452978134155273, "loss/fcd": 1.3984375, "loss/idx": 7.0, "loss/logits": 0.18468762934207916, "step": 1708 }, { "epoch": 0.025519079581003294, "grad_norm": 0.41015625, "grad_norm_var": 0.005927133560180664, "learning_rate": 0.0001, "loss": 1.684, "loss/crossentropy": 2.57491397857666, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.21522057056427002, "step": 1709 }, { "epoch": 0.025534011751618273, "grad_norm": 0.4765625, "grad_norm_var": 0.005874490737915039, "learning_rate": 0.0001, "loss": 1.7576, "loss/crossentropy": 2.5990965366363525, "loss/fcd": 1.53515625, "loss/idx": 7.0, "loss/logits": 0.22246869653463364, "step": 1710 }, { "epoch": 0.025548943922233256, "grad_norm": 0.423828125, "grad_norm_var": 0.005826759338378906, "learning_rate": 0.0001, "loss": 1.6086, "loss/crossentropy": 2.581049919128418, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.18676753342151642, "step": 1711 }, { "epoch": 0.025563876092848235, "grad_norm": 0.478515625, "grad_norm_var": 0.005811309814453125, "learning_rate": 0.0001, "loss": 1.6942, "loss/crossentropy": 2.5850048065185547, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.1942325085401535, "step": 1712 }, { "epoch": 0.025578808263463218, "grad_norm": 0.43359375, "grad_norm_var": 0.0059600830078125, "learning_rate": 0.0001, "loss": 1.592, "loss/crossentropy": 2.736702561378479, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.1857914999127388, "step": 1713 }, { "epoch": 0.0255937404340782, "grad_norm": 0.4765625, "grad_norm_var": 0.005956967671712239, "learning_rate": 0.0001, "loss": 1.3808, "loss/crossentropy": 2.79103684425354, "loss/fcd": 1.234375, "loss/idx": 7.0, "loss/logits": 0.14638280868530273, "step": 1714 }, { "epoch": 0.02560867260469318, "grad_norm": 0.4375, "grad_norm_var": 0.00606689453125, "learning_rate": 0.0001, "loss": 1.5415, "loss/crossentropy": 2.5501586198806763, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.16254615038633347, "step": 1715 }, { "epoch": 0.025623604775308163, "grad_norm": 0.47265625, "grad_norm_var": 0.00602715810139974, "learning_rate": 0.0001, "loss": 1.5126, "loss/crossentropy": 2.6981340646743774, "loss/fcd": 1.33984375, "loss/idx": 7.0, "loss/logits": 0.17272990942001343, "step": 1716 }, { "epoch": 0.025638536945923145, "grad_norm": 0.53125, "grad_norm_var": 0.0045010884602864586, "learning_rate": 0.0001, "loss": 1.7696, "loss/crossentropy": 2.4307806491851807, "loss/fcd": 1.5703125, "loss/idx": 7.0, "loss/logits": 0.19927946478128433, "step": 1717 }, { "epoch": 0.025653469116538125, "grad_norm": 0.53125, "grad_norm_var": 0.00454400380452474, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.473902702331543, "loss/fcd": 1.37109375, "loss/idx": 7.0, "loss/logits": 0.1577570140361786, "step": 1718 }, { "epoch": 0.025668401287153107, "grad_norm": 0.404296875, "grad_norm_var": 0.004805501302083333, "learning_rate": 0.0001, "loss": 1.6124, "loss/crossentropy": 2.475058436393738, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.20222157984972, "step": 1719 }, { "epoch": 0.02568333345776809, "grad_norm": 0.390625, "grad_norm_var": 0.00510552724202474, "learning_rate": 0.0001, "loss": 1.5999, "loss/crossentropy": 2.394508481025696, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.18192025274038315, "step": 1720 }, { "epoch": 0.02569826562838307, "grad_norm": 0.5, "grad_norm_var": 0.005151224136352539, "learning_rate": 0.0001, "loss": 1.6383, "loss/crossentropy": 2.6393444538116455, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.20466701686382294, "step": 1721 }, { "epoch": 0.025713197798998052, "grad_norm": 0.439453125, "grad_norm_var": 0.001846758524576823, "learning_rate": 0.0001, "loss": 1.6449, "loss/crossentropy": 2.59916889667511, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.20742526650428772, "step": 1722 }, { "epoch": 0.02572812996961303, "grad_norm": 0.376953125, "grad_norm_var": 0.0021588484446207683, "learning_rate": 0.0001, "loss": 1.5319, "loss/crossentropy": 2.5505361557006836, "loss/fcd": 1.3671875, "loss/idx": 7.0, "loss/logits": 0.16474661231040955, "step": 1723 }, { "epoch": 0.025743062140228014, "grad_norm": 0.439453125, "grad_norm_var": 0.002151934305826823, "learning_rate": 0.0001, "loss": 1.7384, "loss/crossentropy": 2.5282329320907593, "loss/fcd": 1.53125, "loss/idx": 7.0, "loss/logits": 0.2071186602115631, "step": 1724 }, { "epoch": 0.025757994310842997, "grad_norm": 0.6328125, "grad_norm_var": 0.004025522867838542, "learning_rate": 0.0001, "loss": 1.8401, "loss/crossentropy": 3.063611149787903, "loss/fcd": 1.609375, "loss/idx": 7.0, "loss/logits": 0.2307065650820732, "step": 1725 }, { "epoch": 0.025772926481457976, "grad_norm": 0.44140625, "grad_norm_var": 0.004050127665201823, "learning_rate": 0.0001, "loss": 1.6793, "loss/crossentropy": 2.620327115058899, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.20271999388933182, "step": 1726 }, { "epoch": 0.02578785865207296, "grad_norm": 0.4296875, "grad_norm_var": 0.004021565119425456, "learning_rate": 0.0001, "loss": 1.5943, "loss/crossentropy": 2.5267714262008667, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.19974888116121292, "step": 1727 }, { "epoch": 0.02580279082268794, "grad_norm": 0.359375, "grad_norm_var": 0.004670206705729167, "learning_rate": 0.0001, "loss": 1.4743, "loss/crossentropy": 2.359518885612488, "loss/fcd": 1.3125, "loss/idx": 7.0, "loss/logits": 0.16182270646095276, "step": 1728 }, { "epoch": 0.02581772299330292, "grad_norm": 0.5234375, "grad_norm_var": 0.00490563710530599, "learning_rate": 0.0001, "loss": 1.6289, "loss/crossentropy": 2.655366063117981, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.19535569846630096, "step": 1729 }, { "epoch": 0.025832655163917903, "grad_norm": 0.4765625, "grad_norm_var": 0.00490563710530599, "learning_rate": 0.0001, "loss": 1.6928, "loss/crossentropy": 2.4279606342315674, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.18886201083660126, "step": 1730 }, { "epoch": 0.025847587334532886, "grad_norm": 0.62109375, "grad_norm_var": 0.006420644124348959, "learning_rate": 0.0001, "loss": 1.7304, "loss/crossentropy": 2.588435649871826, "loss/fcd": 1.53515625, "loss/idx": 7.0, "loss/logits": 0.19522760808467865, "step": 1731 }, { "epoch": 0.025862519505147865, "grad_norm": 0.423828125, "grad_norm_var": 0.006572834650675456, "learning_rate": 0.0001, "loss": 1.5089, "loss/crossentropy": 2.426050543785095, "loss/fcd": 1.34375, "loss/idx": 7.0, "loss/logits": 0.16515402495861053, "step": 1732 }, { "epoch": 0.025877451675762848, "grad_norm": 0.43359375, "grad_norm_var": 0.006372563044230143, "learning_rate": 0.0001, "loss": 1.5332, "loss/crossentropy": 2.7584487199783325, "loss/fcd": 1.35546875, "loss/idx": 7.0, "loss/logits": 0.17777415364980698, "step": 1733 }, { "epoch": 0.02589238384637783, "grad_norm": 0.41796875, "grad_norm_var": 0.006158685684204102, "learning_rate": 0.0001, "loss": 1.6417, "loss/crossentropy": 2.219975471496582, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.20416638255119324, "step": 1734 }, { "epoch": 0.02590731601699281, "grad_norm": 0.423828125, "grad_norm_var": 0.00604551633199056, "learning_rate": 0.0001, "loss": 1.5733, "loss/crossentropy": 2.7796677350997925, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.17880570888519287, "step": 1735 }, { "epoch": 0.025922248187607792, "grad_norm": 0.421875, "grad_norm_var": 0.0058252811431884766, "learning_rate": 0.0001, "loss": 1.6237, "loss/crossentropy": 2.6636239290237427, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.19006356596946716, "step": 1736 }, { "epoch": 0.02593718035822277, "grad_norm": 0.388671875, "grad_norm_var": 0.00600738525390625, "learning_rate": 0.0001, "loss": 1.4821, "loss/crossentropy": 2.5613255500793457, "loss/fcd": 1.32421875, "loss/idx": 7.0, "loss/logits": 0.15791697800159454, "step": 1737 }, { "epoch": 0.025952112528837754, "grad_norm": 0.435546875, "grad_norm_var": 0.006015459696451823, "learning_rate": 0.0001, "loss": 1.5669, "loss/crossentropy": 2.7683653831481934, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.1879609152674675, "step": 1738 }, { "epoch": 0.025967044699452737, "grad_norm": 0.484375, "grad_norm_var": 0.005649169286092122, "learning_rate": 0.0001, "loss": 1.6172, "loss/crossentropy": 2.6315919160842896, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.18363645672798157, "step": 1739 }, { "epoch": 0.025981976870067716, "grad_norm": 0.453125, "grad_norm_var": 0.005624135335286458, "learning_rate": 0.0001, "loss": 1.6165, "loss/crossentropy": 2.885563850402832, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.20242498070001602, "step": 1740 }, { "epoch": 0.0259969090406827, "grad_norm": 0.453125, "grad_norm_var": 0.0035125732421875, "learning_rate": 0.0001, "loss": 1.5578, "loss/crossentropy": 2.66191029548645, "loss/fcd": 1.38671875, "loss/idx": 7.0, "loss/logits": 0.17110195010900497, "step": 1741 }, { "epoch": 0.02601184121129768, "grad_norm": 0.380859375, "grad_norm_var": 0.003804763158162435, "learning_rate": 0.0001, "loss": 1.516, "loss/crossentropy": 2.490989565849304, "loss/fcd": 1.3515625, "loss/idx": 7.0, "loss/logits": 0.1644165813922882, "step": 1742 }, { "epoch": 0.02602677338191266, "grad_norm": 0.42578125, "grad_norm_var": 0.003813918431599935, "learning_rate": 0.0001, "loss": 1.6632, "loss/crossentropy": 2.6029820442199707, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.19833851605653763, "step": 1743 }, { "epoch": 0.026041705552527643, "grad_norm": 0.474609375, "grad_norm_var": 0.003325335184733073, "learning_rate": 0.0001, "loss": 1.6439, "loss/crossentropy": 2.4745417833328247, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.18294240534305573, "step": 1744 }, { "epoch": 0.026056637723142626, "grad_norm": 0.447265625, "grad_norm_var": 0.002966419855753581, "learning_rate": 0.0001, "loss": 1.7305, "loss/crossentropy": 2.6282153129577637, "loss/fcd": 1.53515625, "loss/idx": 7.0, "loss/logits": 0.1953415721654892, "step": 1745 }, { "epoch": 0.026071569893757605, "grad_norm": 0.7734375, "grad_norm_var": 0.009620014826456707, "learning_rate": 0.0001, "loss": 2.1886, "loss/crossentropy": 2.5288267135620117, "loss/fcd": 1.87890625, "loss/idx": 7.0, "loss/logits": 0.3096487820148468, "step": 1746 }, { "epoch": 0.026086502064372588, "grad_norm": 0.43359375, "grad_norm_var": 0.007944599787394205, "learning_rate": 0.0001, "loss": 1.5432, "loss/crossentropy": 2.5799275636672974, "loss/fcd": 1.3671875, "loss/idx": 7.0, "loss/logits": 0.1759936362504959, "step": 1747 }, { "epoch": 0.026101434234987567, "grad_norm": 0.455078125, "grad_norm_var": 0.007877969741821289, "learning_rate": 0.0001, "loss": 1.5568, "loss/crossentropy": 2.489788770675659, "loss/fcd": 1.38671875, "loss/idx": 7.0, "loss/logits": 0.17003405839204788, "step": 1748 }, { "epoch": 0.02611636640560255, "grad_norm": 0.54296875, "grad_norm_var": 0.008292754491170248, "learning_rate": 0.0001, "loss": 1.6785, "loss/crossentropy": 2.467368721961975, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.1863422393798828, "step": 1749 }, { "epoch": 0.026131298576217533, "grad_norm": 0.376953125, "grad_norm_var": 0.008645566304524739, "learning_rate": 0.0001, "loss": 1.5403, "loss/crossentropy": 2.6278090476989746, "loss/fcd": 1.36328125, "loss/idx": 7.0, "loss/logits": 0.17704641819000244, "step": 1750 }, { "epoch": 0.026146230746832512, "grad_norm": 0.416015625, "grad_norm_var": 0.008687782287597656, "learning_rate": 0.0001, "loss": 1.5574, "loss/crossentropy": 2.709711790084839, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.17853021621704102, "step": 1751 }, { "epoch": 0.026161162917447495, "grad_norm": 0.423828125, "grad_norm_var": 0.008678038914998373, "learning_rate": 0.0001, "loss": 1.6053, "loss/crossentropy": 2.553797960281372, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.1872834861278534, "step": 1752 }, { "epoch": 0.026176095088062477, "grad_norm": 0.421875, "grad_norm_var": 0.008429718017578126, "learning_rate": 0.0001, "loss": 1.7212, "loss/crossentropy": 2.6268469095230103, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.213436096906662, "step": 1753 }, { "epoch": 0.026191027258677457, "grad_norm": 0.4296875, "grad_norm_var": 0.008452844619750977, "learning_rate": 0.0001, "loss": 1.7586, "loss/crossentropy": 2.3185973167419434, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.2156294584274292, "step": 1754 }, { "epoch": 0.02620595942929244, "grad_norm": 0.44921875, "grad_norm_var": 0.008425378799438476, "learning_rate": 0.0001, "loss": 1.7854, "loss/crossentropy": 2.4275808334350586, "loss/fcd": 1.55859375, "loss/idx": 7.0, "loss/logits": 0.22683294117450714, "step": 1755 }, { "epoch": 0.026220891599907422, "grad_norm": 0.45703125, "grad_norm_var": 0.00842283566792806, "learning_rate": 0.0001, "loss": 1.59, "loss/crossentropy": 2.330631732940674, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.17204724997282028, "step": 1756 }, { "epoch": 0.0262358237705224, "grad_norm": 0.275390625, "grad_norm_var": 0.01056207021077474, "learning_rate": 0.0001, "loss": 1.5004, "loss/crossentropy": 2.5538136959075928, "loss/fcd": 1.34375, "loss/idx": 7.25, "loss/logits": 0.15666767954826355, "step": 1757 }, { "epoch": 0.026250755941137384, "grad_norm": 0.2490234375, "grad_norm_var": 0.01284570296605428, "learning_rate": 0.0001, "loss": 1.3825, "loss/crossentropy": 2.707225203514099, "loss/fcd": 1.20703125, "loss/idx": 7.5, "loss/logits": 0.17551743984222412, "step": 1758 }, { "epoch": 0.026265688111752363, "grad_norm": 0.353515625, "grad_norm_var": 0.013316182295481364, "learning_rate": 0.0001, "loss": 1.6501, "loss/crossentropy": 2.4359453916549683, "loss/fcd": 1.4453125, "loss/idx": 7.5, "loss/logits": 0.20481543242931366, "step": 1759 }, { "epoch": 0.026280620282367346, "grad_norm": 0.34375, "grad_norm_var": 0.013716598351796469, "learning_rate": 0.0001, "loss": 1.7616, "loss/crossentropy": 2.471827507019043, "loss/fcd": 1.515625, "loss/idx": 7.5, "loss/logits": 0.24594175815582275, "step": 1760 }, { "epoch": 0.02629555245298233, "grad_norm": 0.28515625, "grad_norm_var": 0.01494350035985311, "learning_rate": 0.0001, "loss": 1.424, "loss/crossentropy": 2.588927745819092, "loss/fcd": 1.265625, "loss/idx": 7.5, "loss/logits": 0.15835162997245789, "step": 1761 }, { "epoch": 0.026310484623597308, "grad_norm": 0.294921875, "grad_norm_var": 0.006571034590403239, "learning_rate": 0.0001, "loss": 1.5482, "loss/crossentropy": 2.506476879119873, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.19668720662593842, "step": 1762 }, { "epoch": 0.02632541679421229, "grad_norm": 0.31640625, "grad_norm_var": 0.0067169467608133955, "learning_rate": 0.0001, "loss": 1.6674, "loss/crossentropy": 2.1519944071769714, "loss/fcd": 1.4609375, "loss/idx": 7.5, "loss/logits": 0.2064507007598877, "step": 1763 }, { "epoch": 0.026340348964827273, "grad_norm": 0.322265625, "grad_norm_var": 0.0065018614133199055, "learning_rate": 0.0001, "loss": 1.6031, "loss/crossentropy": 2.115912914276123, "loss/fcd": 1.40625, "loss/idx": 7.5, "loss/logits": 0.19682101160287857, "step": 1764 }, { "epoch": 0.026355281135442252, "grad_norm": 0.294921875, "grad_norm_var": 0.004705297946929932, "learning_rate": 0.0001, "loss": 1.4445, "loss/crossentropy": 2.650931715965271, "loss/fcd": 1.27734375, "loss/idx": 7.5, "loss/logits": 0.16712762415409088, "step": 1765 }, { "epoch": 0.026370213306057235, "grad_norm": 0.546875, "grad_norm_var": 0.00696483850479126, "learning_rate": 0.0001, "loss": 1.827, "loss/crossentropy": 2.122111737728119, "loss/fcd": 1.6015625, "loss/idx": 7.5, "loss/logits": 0.2254578173160553, "step": 1766 }, { "epoch": 0.026385145476672218, "grad_norm": 0.267578125, "grad_norm_var": 0.007381594181060791, "learning_rate": 0.0001, "loss": 1.5057, "loss/crossentropy": 2.469594359397888, "loss/fcd": 1.31640625, "loss/idx": 7.5, "loss/logits": 0.18926545977592468, "step": 1767 }, { "epoch": 0.026400077647287197, "grad_norm": 0.263671875, "grad_norm_var": 0.007583614190419515, "learning_rate": 0.0001, "loss": 1.3253, "loss/crossentropy": 2.4640984535217285, "loss/fcd": 1.171875, "loss/idx": 7.5, "loss/logits": 0.15345098823308945, "step": 1768 }, { "epoch": 0.02641500981790218, "grad_norm": 0.6484375, "grad_norm_var": 0.013017205397288005, "learning_rate": 0.0001, "loss": 1.9709, "loss/crossentropy": 2.5803475379943848, "loss/fcd": 1.71875, "loss/idx": 7.5, "loss/logits": 0.2521095424890518, "step": 1769 }, { "epoch": 0.026429941988517162, "grad_norm": 0.306640625, "grad_norm_var": 0.012858990828196208, "learning_rate": 0.0001, "loss": 1.5404, "loss/crossentropy": 2.5042024850845337, "loss/fcd": 1.33984375, "loss/idx": 7.5, "loss/logits": 0.20056728273630142, "step": 1770 }, { "epoch": 0.02644487415913214, "grad_norm": 0.30078125, "grad_norm_var": 0.01236492395401001, "learning_rate": 0.0001, "loss": 1.4027, "loss/crossentropy": 2.4148744344711304, "loss/fcd": 1.2421875, "loss/idx": 7.5, "loss/logits": 0.1605551317334175, "step": 1771 }, { "epoch": 0.026459806329747124, "grad_norm": 0.26953125, "grad_norm_var": 0.01177135705947876, "learning_rate": 0.0001, "loss": 1.3654, "loss/crossentropy": 2.6074177026748657, "loss/fcd": 1.203125, "loss/idx": 7.5, "loss/logits": 0.16225765645503998, "step": 1772 }, { "epoch": 0.026474738500362104, "grad_norm": 0.291015625, "grad_norm_var": 0.011665181318918864, "learning_rate": 0.0001, "loss": 1.4159, "loss/crossentropy": 2.6354998350143433, "loss/fcd": 1.2421875, "loss/idx": 7.5, "loss/logits": 0.1736658662557602, "step": 1773 }, { "epoch": 0.026489670670977086, "grad_norm": 0.314453125, "grad_norm_var": 0.011185693740844726, "learning_rate": 0.0001, "loss": 1.4989, "loss/crossentropy": 2.5636308193206787, "loss/fcd": 1.30859375, "loss/idx": 7.5, "loss/logits": 0.1903422474861145, "step": 1774 }, { "epoch": 0.02650460284159207, "grad_norm": 0.296875, "grad_norm_var": 0.01127465565999349, "learning_rate": 0.0001, "loss": 1.5055, "loss/crossentropy": 2.777296304702759, "loss/fcd": 1.31640625, "loss/idx": 7.5, "loss/logits": 0.18906784802675247, "step": 1775 }, { "epoch": 0.026519535012207048, "grad_norm": 0.318359375, "grad_norm_var": 0.011286020278930664, "learning_rate": 0.0001, "loss": 1.6954, "loss/crossentropy": 2.39978289604187, "loss/fcd": 1.43359375, "loss/idx": 7.5, "loss/logits": 0.26185186207294464, "step": 1776 }, { "epoch": 0.02653446718282203, "grad_norm": 0.306640625, "grad_norm_var": 0.011176045735677083, "learning_rate": 0.0001, "loss": 1.4712, "loss/crossentropy": 2.3673434257507324, "loss/fcd": 1.30078125, "loss/idx": 7.5, "loss/logits": 0.17040134966373444, "step": 1777 }, { "epoch": 0.026549399353437014, "grad_norm": 0.32421875, "grad_norm_var": 0.011073287328084309, "learning_rate": 0.0001, "loss": 1.5825, "loss/crossentropy": 2.438999891281128, "loss/fcd": 1.38671875, "loss/idx": 7.5, "loss/logits": 0.1957414448261261, "step": 1778 }, { "epoch": 0.026564331524051993, "grad_norm": 0.318359375, "grad_norm_var": 0.011068216959635417, "learning_rate": 0.0001, "loss": 1.4316, "loss/crossentropy": 2.551382541656494, "loss/fcd": 1.2578125, "loss/idx": 7.5, "loss/logits": 0.1737765148282051, "step": 1779 }, { "epoch": 0.026579263694666976, "grad_norm": 0.349609375, "grad_norm_var": 0.011061541239420573, "learning_rate": 0.0001, "loss": 1.5833, "loss/crossentropy": 2.7158048152923584, "loss/fcd": 1.38671875, "loss/idx": 7.5, "loss/logits": 0.19656657427549362, "step": 1780 }, { "epoch": 0.026594195865281958, "grad_norm": 1.3359375, "grad_norm_var": 0.07272782325744628, "learning_rate": 0.0001, "loss": 2.2707, "loss/crossentropy": 2.624470829963684, "loss/fcd": 1.8203125, "loss/idx": 7.5, "loss/logits": 0.4503566026687622, "step": 1781 }, { "epoch": 0.026609128035896937, "grad_norm": 0.359375, "grad_norm_var": 0.07134537696838379, "learning_rate": 0.0001, "loss": 1.5925, "loss/crossentropy": 2.7444372177124023, "loss/fcd": 1.3828125, "loss/idx": 7.5, "loss/logits": 0.209727481007576, "step": 1782 }, { "epoch": 0.02662406020651192, "grad_norm": 0.310546875, "grad_norm_var": 0.07074812253316244, "learning_rate": 0.0001, "loss": 1.5231, "loss/crossentropy": 2.5853861570358276, "loss/fcd": 1.33203125, "loss/idx": 7.5, "loss/logits": 0.19111276417970657, "step": 1783 }, { "epoch": 0.0266389923771269, "grad_norm": 0.314453125, "grad_norm_var": 0.07002243995666504, "learning_rate": 0.0001, "loss": 1.6429, "loss/crossentropy": 2.1288896799087524, "loss/fcd": 1.4609375, "loss/idx": 7.5, "loss/logits": 0.18192436546087265, "step": 1784 }, { "epoch": 0.026653924547741882, "grad_norm": 0.26171875, "grad_norm_var": 0.06644730567932129, "learning_rate": 0.0001, "loss": 1.3741, "loss/crossentropy": 2.598435163497925, "loss/fcd": 1.21484375, "loss/idx": 7.5, "loss/logits": 0.15930332243442535, "step": 1785 }, { "epoch": 0.026668856718356865, "grad_norm": 0.73046875, "grad_norm_var": 0.0738870620727539, "learning_rate": 0.0001, "loss": 1.798, "loss/crossentropy": 2.725283145904541, "loss/fcd": 1.51171875, "loss/idx": 7.5, "loss/logits": 0.2863228842616081, "step": 1786 }, { "epoch": 0.026683788888971844, "grad_norm": 0.287109375, "grad_norm_var": 0.07407987912495931, "learning_rate": 0.0001, "loss": 1.4838, "loss/crossentropy": 2.539591908454895, "loss/fcd": 1.30078125, "loss/idx": 7.5, "loss/logits": 0.18304357677698135, "step": 1787 }, { "epoch": 0.026698721059586827, "grad_norm": 0.287109375, "grad_norm_var": 0.07379506429036459, "learning_rate": 0.0001, "loss": 1.4466, "loss/crossentropy": 2.545486330986023, "loss/fcd": 1.2734375, "loss/idx": 7.5, "loss/logits": 0.17317884415388107, "step": 1788 }, { "epoch": 0.02671365323020181, "grad_norm": 0.322265625, "grad_norm_var": 0.07340037027994792, "learning_rate": 0.0001, "loss": 1.5762, "loss/crossentropy": 2.6705424785614014, "loss/fcd": 1.3828125, "loss/idx": 7.5, "loss/logits": 0.1934109628200531, "step": 1789 }, { "epoch": 0.02672858540081679, "grad_norm": 0.365234375, "grad_norm_var": 0.07296644846598307, "learning_rate": 0.0001, "loss": 1.6593, "loss/crossentropy": 2.9419108629226685, "loss/fcd": 1.4296875, "loss/idx": 7.5, "loss/logits": 0.22962473332881927, "step": 1790 }, { "epoch": 0.02674351757143177, "grad_norm": 0.306640625, "grad_norm_var": 0.07283094724019369, "learning_rate": 0.0001, "loss": 1.4223, "loss/crossentropy": 2.606139898300171, "loss/fcd": 1.2578125, "loss/idx": 7.5, "loss/logits": 0.16449995338916779, "step": 1791 }, { "epoch": 0.026758449742046754, "grad_norm": 0.322265625, "grad_norm_var": 0.07278618812561036, "learning_rate": 0.0001, "loss": 1.5832, "loss/crossentropy": 2.622813105583191, "loss/fcd": 1.37890625, "loss/idx": 7.5, "loss/logits": 0.20424649119377136, "step": 1792 }, { "epoch": 0.026773381912661733, "grad_norm": 0.3359375, "grad_norm_var": 0.07245025634765626, "learning_rate": 0.0001, "loss": 1.608, "loss/crossentropy": 2.721924066543579, "loss/fcd": 1.40234375, "loss/idx": 7.5, "loss/logits": 0.20560920238494873, "step": 1793 }, { "epoch": 0.026788314083276716, "grad_norm": 0.306640625, "grad_norm_var": 0.07266640663146973, "learning_rate": 0.0001, "loss": 1.6546, "loss/crossentropy": 2.4652721881866455, "loss/fcd": 1.42578125, "loss/idx": 7.5, "loss/logits": 0.22881120443344116, "step": 1794 }, { "epoch": 0.0268032462538917, "grad_norm": 0.47265625, "grad_norm_var": 0.07232863108317057, "learning_rate": 0.0001, "loss": 1.8245, "loss/crossentropy": 2.977765440940857, "loss/fcd": 1.5546875, "loss/idx": 7.5, "loss/logits": 0.26981621235609055, "step": 1795 }, { "epoch": 0.026818178424506678, "grad_norm": 0.2734375, "grad_norm_var": 0.07337314287821452, "learning_rate": 0.0001, "loss": 1.5042, "loss/crossentropy": 2.5598191022872925, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.1760355606675148, "step": 1796 }, { "epoch": 0.02683311059512166, "grad_norm": 0.3125, "grad_norm_var": 0.01275645891825358, "learning_rate": 0.0001, "loss": 1.5425, "loss/crossentropy": 2.583548426628113, "loss/fcd": 1.34375, "loss/idx": 7.5, "loss/logits": 0.19874022156000137, "step": 1797 }, { "epoch": 0.02684804276573664, "grad_norm": 0.25390625, "grad_norm_var": 0.013292042414347331, "learning_rate": 0.0001, "loss": 1.3822, "loss/crossentropy": 2.473318338394165, "loss/fcd": 1.22265625, "loss/idx": 7.5, "loss/logits": 0.15951504558324814, "step": 1798 }, { "epoch": 0.026862974936351623, "grad_norm": 0.31640625, "grad_norm_var": 0.013270060221354166, "learning_rate": 0.0001, "loss": 1.6083, "loss/crossentropy": 2.5721927881240845, "loss/fcd": 1.3984375, "loss/idx": 7.5, "loss/logits": 0.2098878100514412, "step": 1799 }, { "epoch": 0.026877907106966605, "grad_norm": 0.33984375, "grad_norm_var": 0.013217782974243164, "learning_rate": 0.0001, "loss": 1.5284, "loss/crossentropy": 2.5014588832855225, "loss/fcd": 1.33203125, "loss/idx": 7.5, "loss/logits": 0.1963837966322899, "step": 1800 }, { "epoch": 0.026892839277581584, "grad_norm": 0.3046875, "grad_norm_var": 0.012865304946899414, "learning_rate": 0.0001, "loss": 1.5965, "loss/crossentropy": 2.6525572538375854, "loss/fcd": 1.375, "loss/idx": 7.5, "loss/logits": 0.22152644395828247, "step": 1801 }, { "epoch": 0.026907771448196567, "grad_norm": 0.27734375, "grad_norm_var": 0.002473815282185872, "learning_rate": 0.0001, "loss": 1.5907, "loss/crossentropy": 2.556833505630493, "loss/fcd": 1.3828125, "loss/idx": 7.5, "loss/logits": 0.207870252430439, "step": 1802 }, { "epoch": 0.02692270361881155, "grad_norm": 0.302734375, "grad_norm_var": 0.0024252414703369142, "learning_rate": 0.0001, "loss": 1.4564, "loss/crossentropy": 2.721066951751709, "loss/fcd": 1.28125, "loss/idx": 7.5, "loss/logits": 0.17513567954301834, "step": 1803 }, { "epoch": 0.02693763578942653, "grad_norm": 0.294921875, "grad_norm_var": 0.0023961226145426434, "learning_rate": 0.0001, "loss": 1.5316, "loss/crossentropy": 2.4543330669403076, "loss/fcd": 1.34375, "loss/idx": 7.5, "loss/logits": 0.1878231167793274, "step": 1804 }, { "epoch": 0.026952567960041512, "grad_norm": 0.55078125, "grad_norm_var": 0.0057528177897135414, "learning_rate": 0.0001, "loss": 2.158, "loss/crossentropy": 2.319103717803955, "loss/fcd": 1.8203125, "loss/idx": 7.5, "loss/logits": 0.3376483768224716, "step": 1805 }, { "epoch": 0.026967500130656494, "grad_norm": 0.265625, "grad_norm_var": 0.005951420466105143, "learning_rate": 0.0001, "loss": 1.3878, "loss/crossentropy": 2.5267797708511353, "loss/fcd": 1.21875, "loss/idx": 7.5, "loss/logits": 0.16907892376184464, "step": 1806 }, { "epoch": 0.026982432301271474, "grad_norm": 0.353515625, "grad_norm_var": 0.005959812800089518, "learning_rate": 0.0001, "loss": 1.6583, "loss/crossentropy": 2.6328471899032593, "loss/fcd": 1.41796875, "loss/idx": 7.5, "loss/logits": 0.24038030207157135, "step": 1807 }, { "epoch": 0.026997364471886456, "grad_norm": 0.314453125, "grad_norm_var": 0.005971892674763998, "learning_rate": 0.0001, "loss": 1.4136, "loss/crossentropy": 2.696038246154785, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.17528624087572098, "step": 1808 }, { "epoch": 0.027012296642501436, "grad_norm": 0.337890625, "grad_norm_var": 0.00597375233968099, "learning_rate": 0.0001, "loss": 1.517, "loss/crossentropy": 2.461454749107361, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.19280918687582016, "step": 1809 }, { "epoch": 0.02702722881311642, "grad_norm": 0.271484375, "grad_norm_var": 0.006159718831380208, "learning_rate": 0.0001, "loss": 1.4047, "loss/crossentropy": 2.67915940284729, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.17423474788665771, "step": 1810 }, { "epoch": 0.0270421609837314, "grad_norm": 0.30859375, "grad_norm_var": 0.004669698079427084, "learning_rate": 0.0001, "loss": 1.3907, "loss/crossentropy": 2.724575161933899, "loss/fcd": 1.22265625, "loss/idx": 7.5, "loss/logits": 0.16803188621997833, "step": 1811 }, { "epoch": 0.02705709315434638, "grad_norm": 0.4296875, "grad_norm_var": 0.005280049641927084, "learning_rate": 0.0001, "loss": 1.9115, "loss/crossentropy": 2.6169649362564087, "loss/fcd": 1.625, "loss/idx": 7.5, "loss/logits": 0.28652864694595337, "step": 1812 }, { "epoch": 0.027072025324961363, "grad_norm": 0.296875, "grad_norm_var": 0.005325826009114584, "learning_rate": 0.0001, "loss": 1.5192, "loss/crossentropy": 2.7209300994873047, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.19105875492095947, "step": 1813 }, { "epoch": 0.027086957495576346, "grad_norm": 0.9140625, "grad_norm_var": 0.026202837626139324, "learning_rate": 0.0001, "loss": 1.7825, "loss/crossentropy": 2.6711992025375366, "loss/fcd": 1.5078125, "loss/idx": 7.5, "loss/logits": 0.2746375799179077, "step": 1814 }, { "epoch": 0.027101889666191325, "grad_norm": 0.357421875, "grad_norm_var": 0.026028935114542642, "learning_rate": 0.0001, "loss": 1.6117, "loss/crossentropy": 2.5210726261138916, "loss/fcd": 1.3984375, "loss/idx": 7.5, "loss/logits": 0.21324608474969864, "step": 1815 }, { "epoch": 0.027116821836806308, "grad_norm": 0.3828125, "grad_norm_var": 0.02597158749898275, "learning_rate": 0.0001, "loss": 1.5266, "loss/crossentropy": 2.500070095062256, "loss/fcd": 1.3359375, "loss/idx": 7.5, "loss/logits": 0.1906418353319168, "step": 1816 }, { "epoch": 0.02713175400742129, "grad_norm": 0.302734375, "grad_norm_var": 0.025989532470703125, "learning_rate": 0.0001, "loss": 1.5851, "loss/crossentropy": 2.55086088180542, "loss/fcd": 1.39453125, "loss/idx": 7.5, "loss/logits": 0.19061411917209625, "step": 1817 }, { "epoch": 0.02714668617803627, "grad_norm": 0.31640625, "grad_norm_var": 0.0255889892578125, "learning_rate": 0.0001, "loss": 1.6096, "loss/crossentropy": 2.4247604608535767, "loss/fcd": 1.390625, "loss/idx": 7.5, "loss/logits": 0.21899522095918655, "step": 1818 }, { "epoch": 0.027161618348651252, "grad_norm": 0.2734375, "grad_norm_var": 0.0259249210357666, "learning_rate": 0.0001, "loss": 1.3993, "loss/crossentropy": 2.612613081932068, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.16881398856639862, "step": 1819 }, { "epoch": 0.02717655051926623, "grad_norm": 0.291015625, "grad_norm_var": 0.025966628392537435, "learning_rate": 0.0001, "loss": 1.4047, "loss/crossentropy": 2.3704020977020264, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.166450597345829, "step": 1820 }, { "epoch": 0.027191482689881214, "grad_norm": 0.365234375, "grad_norm_var": 0.02371826171875, "learning_rate": 0.0001, "loss": 1.5727, "loss/crossentropy": 2.353190541267395, "loss/fcd": 1.3828125, "loss/idx": 7.5, "loss/logits": 0.1898830384016037, "step": 1821 }, { "epoch": 0.027206414860496197, "grad_norm": 0.283203125, "grad_norm_var": 0.023513269424438477, "learning_rate": 0.0001, "loss": 1.4686, "loss/crossentropy": 2.5452089309692383, "loss/fcd": 1.27734375, "loss/idx": 7.5, "loss/logits": 0.19126763939857483, "step": 1822 }, { "epoch": 0.027221347031111176, "grad_norm": 1.3671875, "grad_norm_var": 0.086529541015625, "learning_rate": 0.0001, "loss": 1.8668, "loss/crossentropy": 3.4751185178756714, "loss/fcd": 1.6484375, "loss/idx": 7.5, "loss/logits": 0.21837294846773148, "step": 1823 }, { "epoch": 0.02723627920172616, "grad_norm": 0.30859375, "grad_norm_var": 0.08661866188049316, "learning_rate": 0.0001, "loss": 1.4579, "loss/crossentropy": 2.515225887298584, "loss/fcd": 1.28125, "loss/idx": 7.5, "loss/logits": 0.1766873598098755, "step": 1824 }, { "epoch": 0.02725121137234114, "grad_norm": 0.296875, "grad_norm_var": 0.08720245361328124, "learning_rate": 0.0001, "loss": 1.3716, "loss/crossentropy": 2.6789597272872925, "loss/fcd": 1.203125, "loss/idx": 7.5, "loss/logits": 0.16845793277025223, "step": 1825 }, { "epoch": 0.02726614354295612, "grad_norm": 0.333984375, "grad_norm_var": 0.08618520100911459, "learning_rate": 0.0001, "loss": 1.3844, "loss/crossentropy": 2.8141207695007324, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.1539573296904564, "step": 1826 }, { "epoch": 0.027281075713571103, "grad_norm": 0.357421875, "grad_norm_var": 0.08556491533915202, "learning_rate": 0.0001, "loss": 1.6948, "loss/crossentropy": 2.623816967010498, "loss/fcd": 1.47265625, "loss/idx": 7.5, "loss/logits": 0.22218744456768036, "step": 1827 }, { "epoch": 0.027296007884186086, "grad_norm": 0.306640625, "grad_norm_var": 0.08651320139567058, "learning_rate": 0.0001, "loss": 1.4101, "loss/crossentropy": 2.72926127910614, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.17186040431261063, "step": 1828 }, { "epoch": 0.027310940054801065, "grad_norm": 0.263671875, "grad_norm_var": 0.08713657061258952, "learning_rate": 0.0001, "loss": 1.4489, "loss/crossentropy": 2.571180582046509, "loss/fcd": 1.26953125, "loss/idx": 7.5, "loss/logits": 0.17934715747833252, "step": 1829 }, { "epoch": 0.027325872225416048, "grad_norm": 0.294921875, "grad_norm_var": 0.07031275431315104, "learning_rate": 0.0001, "loss": 1.5624, "loss/crossentropy": 2.6993069648742676, "loss/fcd": 1.359375, "loss/idx": 7.5, "loss/logits": 0.20297697931528091, "step": 1830 }, { "epoch": 0.02734080439603103, "grad_norm": 0.65234375, "grad_norm_var": 0.07480810483296713, "learning_rate": 0.0001, "loss": 2.2399, "loss/crossentropy": 2.70441734790802, "loss/fcd": 1.86328125, "loss/idx": 7.5, "loss/logits": 0.3765818625688553, "step": 1831 }, { "epoch": 0.02735573656664601, "grad_norm": 0.294921875, "grad_norm_var": 0.0754897435506185, "learning_rate": 0.0001, "loss": 1.4978, "loss/crossentropy": 2.6646634340286255, "loss/fcd": 1.3125, "loss/idx": 7.5, "loss/logits": 0.18530730158090591, "step": 1832 }, { "epoch": 0.027370668737260993, "grad_norm": 0.29296875, "grad_norm_var": 0.07561491330464681, "learning_rate": 0.0001, "loss": 1.4287, "loss/crossentropy": 2.571444511413574, "loss/fcd": 1.25, "loss/idx": 7.5, "loss/logits": 0.17866399139165878, "step": 1833 }, { "epoch": 0.027385600907875972, "grad_norm": 0.31640625, "grad_norm_var": 0.07561491330464681, "learning_rate": 0.0001, "loss": 1.6586, "loss/crossentropy": 2.553916811943054, "loss/fcd": 1.46484375, "loss/idx": 7.5, "loss/logits": 0.19379810988903046, "step": 1834 }, { "epoch": 0.027400533078490955, "grad_norm": 0.29296875, "grad_norm_var": 0.07532563209533691, "learning_rate": 0.0001, "loss": 1.4102, "loss/crossentropy": 2.4229013919830322, "loss/fcd": 1.2421875, "loss/idx": 7.5, "loss/logits": 0.1679960861802101, "step": 1835 }, { "epoch": 0.027415465249105937, "grad_norm": 0.328125, "grad_norm_var": 0.07489770253499349, "learning_rate": 0.0001, "loss": 1.4833, "loss/crossentropy": 2.3115060329437256, "loss/fcd": 1.3125, "loss/idx": 7.5, "loss/logits": 0.17078139632940292, "step": 1836 }, { "epoch": 0.027430397419720916, "grad_norm": 0.279296875, "grad_norm_var": 0.07572574615478515, "learning_rate": 0.0001, "loss": 1.517, "loss/crossentropy": 2.46151602268219, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.19276980310678482, "step": 1837 }, { "epoch": 0.0274453295903359, "grad_norm": 0.271484375, "grad_norm_var": 0.07590408325195312, "learning_rate": 0.0001, "loss": 1.4096, "loss/crossentropy": 2.487010359764099, "loss/fcd": 1.24609375, "loss/idx": 7.5, "loss/logits": 0.16347003728151321, "step": 1838 }, { "epoch": 0.027460261760950882, "grad_norm": 0.310546875, "grad_norm_var": 0.008170048395792643, "learning_rate": 0.0001, "loss": 1.4457, "loss/crossentropy": 2.558194160461426, "loss/fcd": 1.26953125, "loss/idx": 7.5, "loss/logits": 0.1762079894542694, "step": 1839 }, { "epoch": 0.02747519393156586, "grad_norm": 0.283203125, "grad_norm_var": 0.008266131083170572, "learning_rate": 0.0001, "loss": 1.5756, "loss/crossentropy": 2.67186176776886, "loss/fcd": 1.37109375, "loss/idx": 7.5, "loss/logits": 0.20449146628379822, "step": 1840 }, { "epoch": 0.027490126102180844, "grad_norm": 0.302734375, "grad_norm_var": 0.00824748675028483, "learning_rate": 0.0001, "loss": 1.5622, "loss/crossentropy": 2.7830512523651123, "loss/fcd": 1.359375, "loss/idx": 7.5, "loss/logits": 0.20284771919250488, "step": 1841 }, { "epoch": 0.027505058272795826, "grad_norm": 0.251953125, "grad_norm_var": 0.00855724016825358, "learning_rate": 0.0001, "loss": 1.4079, "loss/crossentropy": 2.5222907066345215, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.16962562501430511, "step": 1842 }, { "epoch": 0.027519990443410806, "grad_norm": 0.328125, "grad_norm_var": 0.008459726969401041, "learning_rate": 0.0001, "loss": 1.6206, "loss/crossentropy": 2.350256323814392, "loss/fcd": 1.40234375, "loss/idx": 7.5, "loss/logits": 0.21821796149015427, "step": 1843 }, { "epoch": 0.02753492261402579, "grad_norm": 0.359375, "grad_norm_var": 0.008561436335245769, "learning_rate": 0.0001, "loss": 1.7676, "loss/crossentropy": 2.657059669494629, "loss/fcd": 1.53125, "loss/idx": 7.5, "loss/logits": 0.23639147728681564, "step": 1844 }, { "epoch": 0.027549854784640768, "grad_norm": 0.330078125, "grad_norm_var": 0.008336623509724935, "learning_rate": 0.0001, "loss": 1.5589, "loss/crossentropy": 2.825732111930847, "loss/fcd": 1.35546875, "loss/idx": 7.5, "loss/logits": 0.20341219007968903, "step": 1845 }, { "epoch": 0.02756478695525575, "grad_norm": 0.369140625, "grad_norm_var": 0.008389774958292644, "learning_rate": 0.0001, "loss": 1.7813, "loss/crossentropy": 2.3357163667678833, "loss/fcd": 1.546875, "loss/idx": 7.5, "loss/logits": 0.2344117909669876, "step": 1846 }, { "epoch": 0.027579719125870733, "grad_norm": 0.326171875, "grad_norm_var": 0.0009760538736979167, "learning_rate": 0.0001, "loss": 1.6673, "loss/crossentropy": 2.734041929244995, "loss/fcd": 1.44921875, "loss/idx": 7.5, "loss/logits": 0.2180958315730095, "step": 1847 }, { "epoch": 0.027594651296485712, "grad_norm": 0.3046875, "grad_norm_var": 0.000964212417602539, "learning_rate": 0.0001, "loss": 1.449, "loss/crossentropy": 2.763646125793457, "loss/fcd": 1.28515625, "loss/idx": 7.5, "loss/logits": 0.16384299844503403, "step": 1848 }, { "epoch": 0.027609583467100695, "grad_norm": 0.267578125, "grad_norm_var": 0.0010594685872395834, "learning_rate": 0.0001, "loss": 1.5765, "loss/crossentropy": 2.650328516960144, "loss/fcd": 1.3671875, "loss/idx": 7.5, "loss/logits": 0.2092689573764801, "step": 1849 }, { "epoch": 0.027624515637715678, "grad_norm": 0.306640625, "grad_norm_var": 0.0010539849599202475, "learning_rate": 0.0001, "loss": 1.6693, "loss/crossentropy": 2.6345717906951904, "loss/fcd": 1.421875, "loss/idx": 7.5, "loss/logits": 0.24741078913211823, "step": 1850 }, { "epoch": 0.027639447808330657, "grad_norm": 0.271484375, "grad_norm_var": 0.001123046875, "learning_rate": 0.0001, "loss": 1.3875, "loss/crossentropy": 2.402801752090454, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.15703191608190536, "step": 1851 }, { "epoch": 0.02765437997894564, "grad_norm": 0.296875, "grad_norm_var": 0.0010904947916666667, "learning_rate": 0.0001, "loss": 1.5771, "loss/crossentropy": 2.4666026830673218, "loss/fcd": 1.359375, "loss/idx": 7.5, "loss/logits": 0.21776925027370453, "step": 1852 }, { "epoch": 0.027669312149560622, "grad_norm": 0.322265625, "grad_norm_var": 0.0010660171508789062, "learning_rate": 0.0001, "loss": 1.6152, "loss/crossentropy": 2.6620728969573975, "loss/fcd": 1.40234375, "loss/idx": 7.5, "loss/logits": 0.21284056454896927, "step": 1853 }, { "epoch": 0.0276842443201756, "grad_norm": 0.29296875, "grad_norm_var": 0.00099485715230306, "learning_rate": 0.0001, "loss": 1.5247, "loss/crossentropy": 2.656482458114624, "loss/fcd": 1.33203125, "loss/idx": 7.5, "loss/logits": 0.19264397770166397, "step": 1854 }, { "epoch": 0.027699176490790584, "grad_norm": 0.30859375, "grad_norm_var": 0.0009943644205729167, "learning_rate": 0.0001, "loss": 1.3658, "loss/crossentropy": 2.710697650909424, "loss/fcd": 1.19921875, "loss/idx": 7.5, "loss/logits": 0.16661225259304047, "step": 1855 }, { "epoch": 0.027714108661405567, "grad_norm": 0.30078125, "grad_norm_var": 0.0009564558664957682, "learning_rate": 0.0001, "loss": 1.5226, "loss/crossentropy": 2.712221622467041, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.1944480687379837, "step": 1856 }, { "epoch": 0.027729040832020546, "grad_norm": 0.4140625, "grad_norm_var": 0.0016422907511393229, "learning_rate": 0.0001, "loss": 1.6438, "loss/crossentropy": 2.5411465167999268, "loss/fcd": 1.42578125, "loss/idx": 7.5, "loss/logits": 0.2180405557155609, "step": 1857 }, { "epoch": 0.02774397300263553, "grad_norm": 0.263671875, "grad_norm_var": 0.0015513102213541667, "learning_rate": 0.0001, "loss": 1.4363, "loss/crossentropy": 2.5804349184036255, "loss/fcd": 1.265625, "loss/idx": 7.5, "loss/logits": 0.17066682875156403, "step": 1858 }, { "epoch": 0.027758905173250508, "grad_norm": 0.298828125, "grad_norm_var": 0.0015591780344645183, "learning_rate": 0.0001, "loss": 1.4123, "loss/crossentropy": 2.710708737373352, "loss/fcd": 1.234375, "loss/idx": 7.5, "loss/logits": 0.17788437753915787, "step": 1859 }, { "epoch": 0.02777383734386549, "grad_norm": 0.369140625, "grad_norm_var": 0.0016234715779622395, "learning_rate": 0.0001, "loss": 1.5228, "loss/crossentropy": 2.003714084625244, "loss/fcd": 1.35546875, "loss/idx": 7.5, "loss/logits": 0.16729778051376343, "step": 1860 }, { "epoch": 0.027788769514480473, "grad_norm": 0.302734375, "grad_norm_var": 0.00161590576171875, "learning_rate": 0.0001, "loss": 1.577, "loss/crossentropy": 2.538116216659546, "loss/fcd": 1.375, "loss/idx": 7.5, "loss/logits": 0.2019786238670349, "step": 1861 }, { "epoch": 0.027803701685095453, "grad_norm": 0.287109375, "grad_norm_var": 0.0014276504516601562, "learning_rate": 0.0001, "loss": 1.4848, "loss/crossentropy": 2.833705425262451, "loss/fcd": 1.29296875, "loss/idx": 7.5, "loss/logits": 0.19186238199472427, "step": 1862 }, { "epoch": 0.027818633855710435, "grad_norm": 0.32421875, "grad_norm_var": 0.0014232476552327475, "learning_rate": 0.0001, "loss": 1.6606, "loss/crossentropy": 2.7050453424453735, "loss/fcd": 1.4453125, "loss/idx": 7.5, "loss/logits": 0.21526382863521576, "step": 1863 }, { "epoch": 0.027833566026325418, "grad_norm": 0.283203125, "grad_norm_var": 0.001462237040201823, "learning_rate": 0.0001, "loss": 1.4428, "loss/crossentropy": 2.618694305419922, "loss/fcd": 1.265625, "loss/idx": 7.5, "loss/logits": 0.17713554948568344, "step": 1864 }, { "epoch": 0.027848498196940397, "grad_norm": 0.298828125, "grad_norm_var": 0.0013594945271809896, "learning_rate": 0.0001, "loss": 1.418, "loss/crossentropy": 2.6274200677871704, "loss/fcd": 1.25, "loss/idx": 7.5, "loss/logits": 0.16798733174800873, "step": 1865 }, { "epoch": 0.02786343036755538, "grad_norm": 0.3359375, "grad_norm_var": 0.0014045556386311849, "learning_rate": 0.0001, "loss": 1.6525, "loss/crossentropy": 2.531873106956482, "loss/fcd": 1.42578125, "loss/idx": 7.5, "loss/logits": 0.2267523929476738, "step": 1866 }, { "epoch": 0.027878362538170363, "grad_norm": 0.294921875, "grad_norm_var": 0.0013164361317952475, "learning_rate": 0.0001, "loss": 1.6755, "loss/crossentropy": 2.2504754066467285, "loss/fcd": 1.45703125, "loss/idx": 7.5, "loss/logits": 0.21843808144330978, "step": 1867 }, { "epoch": 0.027893294708785342, "grad_norm": 0.310546875, "grad_norm_var": 0.0013003031412760417, "learning_rate": 0.0001, "loss": 1.5554, "loss/crossentropy": 2.297727584838867, "loss/fcd": 1.3671875, "loss/idx": 7.5, "loss/logits": 0.18825874477624893, "step": 1868 }, { "epoch": 0.027908226879400325, "grad_norm": 0.265625, "grad_norm_var": 0.0014307498931884766, "learning_rate": 0.0001, "loss": 1.3449, "loss/crossentropy": 2.520071268081665, "loss/fcd": 1.19140625, "loss/idx": 7.5, "loss/logits": 0.15346477925777435, "step": 1869 }, { "epoch": 0.027923159050015304, "grad_norm": 0.265625, "grad_norm_var": 0.0015375614166259766, "learning_rate": 0.0001, "loss": 1.4415, "loss/crossentropy": 2.5880095958709717, "loss/fcd": 1.26171875, "loss/idx": 7.5, "loss/logits": 0.17981453239917755, "step": 1870 }, { "epoch": 0.027938091220630287, "grad_norm": 0.28125, "grad_norm_var": 0.0015811761220296224, "learning_rate": 0.0001, "loss": 1.5538, "loss/crossentropy": 2.5557409524917603, "loss/fcd": 1.34375, "loss/idx": 7.5, "loss/logits": 0.210035502910614, "step": 1871 }, { "epoch": 0.02795302339124527, "grad_norm": 0.27734375, "grad_norm_var": 0.001631911595662435, "learning_rate": 0.0001, "loss": 1.3297, "loss/crossentropy": 2.4933377504348755, "loss/fcd": 1.18359375, "loss/idx": 7.5, "loss/logits": 0.14611776173114777, "step": 1872 }, { "epoch": 0.02796795556186025, "grad_norm": 0.427734375, "grad_norm_var": 0.0018431981404622396, "learning_rate": 0.0001, "loss": 1.6354, "loss/crossentropy": 2.6224533319473267, "loss/fcd": 1.421875, "loss/idx": 7.5, "loss/logits": 0.21356236189603806, "step": 1873 }, { "epoch": 0.02798288773247523, "grad_norm": 0.2734375, "grad_norm_var": 0.0017947991689046224, "learning_rate": 0.0001, "loss": 1.5447, "loss/crossentropy": 2.4397172927856445, "loss/fcd": 1.34375, "loss/idx": 7.5, "loss/logits": 0.2009110450744629, "step": 1874 }, { "epoch": 0.027997819903090214, "grad_norm": 0.30859375, "grad_norm_var": 0.0017913818359375, "learning_rate": 0.0001, "loss": 1.5605, "loss/crossentropy": 2.469580054283142, "loss/fcd": 1.36328125, "loss/idx": 7.5, "loss/logits": 0.19724100083112717, "step": 1875 }, { "epoch": 0.028012752073705193, "grad_norm": 0.302734375, "grad_norm_var": 0.0015136082967122397, "learning_rate": 0.0001, "loss": 1.5029, "loss/crossentropy": 2.7592979669570923, "loss/fcd": 1.30078125, "loss/idx": 7.5, "loss/logits": 0.20207630097866058, "step": 1876 }, { "epoch": 0.028027684244320176, "grad_norm": 0.2578125, "grad_norm_var": 0.0016382694244384765, "learning_rate": 0.0001, "loss": 1.4717, "loss/crossentropy": 2.464821934700012, "loss/fcd": 1.28515625, "loss/idx": 7.5, "loss/logits": 0.18653883039951324, "step": 1877 }, { "epoch": 0.02804261641493516, "grad_norm": 0.330078125, "grad_norm_var": 0.0016816298166910807, "learning_rate": 0.0001, "loss": 1.7049, "loss/crossentropy": 2.4362906217575073, "loss/fcd": 1.47265625, "loss/idx": 7.5, "loss/logits": 0.23222223669290543, "step": 1878 }, { "epoch": 0.028057548585550138, "grad_norm": 0.2734375, "grad_norm_var": 0.0016948541005452475, "learning_rate": 0.0001, "loss": 1.4679, "loss/crossentropy": 2.5615785121917725, "loss/fcd": 1.296875, "loss/idx": 7.5, "loss/logits": 0.1710590422153473, "step": 1879 }, { "epoch": 0.02807248075616512, "grad_norm": 0.275390625, "grad_norm_var": 0.0017153263092041016, "learning_rate": 0.0001, "loss": 1.4277, "loss/crossentropy": 2.6645255088806152, "loss/fcd": 1.25390625, "loss/idx": 7.5, "loss/logits": 0.17376437038183212, "step": 1880 }, { "epoch": 0.0280874129267801, "grad_norm": 0.298828125, "grad_norm_var": 0.0017153263092041016, "learning_rate": 0.0001, "loss": 1.5619, "loss/crossentropy": 2.543599843978882, "loss/fcd": 1.36328125, "loss/idx": 7.5, "loss/logits": 0.1986057609319687, "step": 1881 }, { "epoch": 0.028102345097395082, "grad_norm": 0.294921875, "grad_norm_var": 0.0016168594360351563, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.642424702644348, "loss/fcd": 1.33203125, "loss/idx": 7.5, "loss/logits": 0.19652438908815384, "step": 1882 }, { "epoch": 0.028117277268010065, "grad_norm": 0.318359375, "grad_norm_var": 0.0016473770141601563, "learning_rate": 0.0001, "loss": 1.5587, "loss/crossentropy": 2.63770067691803, "loss/fcd": 1.359375, "loss/idx": 7.5, "loss/logits": 0.19933994114398956, "step": 1883 }, { "epoch": 0.028132209438625044, "grad_norm": 0.310546875, "grad_norm_var": 0.0016473770141601563, "learning_rate": 0.0001, "loss": 1.5717, "loss/crossentropy": 2.5321085453033447, "loss/fcd": 1.38671875, "loss/idx": 7.5, "loss/logits": 0.18496868759393692, "step": 1884 }, { "epoch": 0.028147141609240027, "grad_norm": 0.283203125, "grad_norm_var": 0.0015917301177978515, "learning_rate": 0.0001, "loss": 1.4883, "loss/crossentropy": 2.5472614765167236, "loss/fcd": 1.30859375, "loss/idx": 7.5, "loss/logits": 0.17972340434789658, "step": 1885 }, { "epoch": 0.02816207377985501, "grad_norm": 0.30859375, "grad_norm_var": 0.0015175978342692057, "learning_rate": 0.0001, "loss": 1.6146, "loss/crossentropy": 2.5076574087142944, "loss/fcd": 1.3828125, "loss/idx": 7.5, "loss/logits": 0.2317841500043869, "step": 1886 }, { "epoch": 0.02817700595046999, "grad_norm": 0.322265625, "grad_norm_var": 0.0015125910441080729, "learning_rate": 0.0001, "loss": 1.6261, "loss/crossentropy": 2.5463353395462036, "loss/fcd": 1.40625, "loss/idx": 7.5, "loss/logits": 0.21981056034564972, "step": 1887 }, { "epoch": 0.02819193812108497, "grad_norm": 0.314453125, "grad_norm_var": 0.001466989517211914, "learning_rate": 0.0001, "loss": 1.5593, "loss/crossentropy": 2.4339152574539185, "loss/fcd": 1.36328125, "loss/idx": 7.5, "loss/logits": 0.19599353522062302, "step": 1888 }, { "epoch": 0.028206870291699954, "grad_norm": 0.373046875, "grad_norm_var": 0.0007682641347249349, "learning_rate": 0.0001, "loss": 1.6727, "loss/crossentropy": 2.5966817140579224, "loss/fcd": 1.42578125, "loss/idx": 7.5, "loss/logits": 0.24687369167804718, "step": 1889 }, { "epoch": 0.028221802462314934, "grad_norm": 0.392578125, "grad_norm_var": 0.0011880874633789062, "learning_rate": 0.0001, "loss": 1.503, "loss/crossentropy": 2.6890329122543335, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.1788247525691986, "step": 1890 }, { "epoch": 0.028236734632929916, "grad_norm": 0.259765625, "grad_norm_var": 0.0013482252756754558, "learning_rate": 0.0001, "loss": 1.4354, "loss/crossentropy": 2.4243786334991455, "loss/fcd": 1.265625, "loss/idx": 7.5, "loss/logits": 0.16975828260183334, "step": 1891 }, { "epoch": 0.0282516668035449, "grad_norm": 0.30078125, "grad_norm_var": 0.001349639892578125, "learning_rate": 0.0001, "loss": 1.499, "loss/crossentropy": 2.6386592388153076, "loss/fcd": 1.3046875, "loss/idx": 7.5, "loss/logits": 0.1942928582429886, "step": 1892 }, { "epoch": 0.028266598974159878, "grad_norm": 0.326171875, "grad_norm_var": 0.0011922041575113931, "learning_rate": 0.0001, "loss": 1.6796, "loss/crossentropy": 2.382668614387512, "loss/fcd": 1.4609375, "loss/idx": 7.5, "loss/logits": 0.21862833201885223, "step": 1893 }, { "epoch": 0.02828153114477486, "grad_norm": 0.291015625, "grad_norm_var": 0.0011902968088785806, "learning_rate": 0.0001, "loss": 1.4194, "loss/crossentropy": 2.457250952720642, "loss/fcd": 1.2578125, "loss/idx": 7.5, "loss/logits": 0.16157541424036026, "step": 1894 }, { "epoch": 0.02829646331538984, "grad_norm": 0.251953125, "grad_norm_var": 0.0013209025065104166, "learning_rate": 0.0001, "loss": 1.3678, "loss/crossentropy": 2.478179931640625, "loss/fcd": 1.203125, "loss/idx": 7.5, "loss/logits": 0.16465666145086288, "step": 1895 }, { "epoch": 0.028311395486004823, "grad_norm": 0.2412109375, "grad_norm_var": 0.0015407840410868326, "learning_rate": 0.0001, "loss": 1.5487, "loss/crossentropy": 2.4185166358947754, "loss/fcd": 1.34375, "loss/idx": 7.5, "loss/logits": 0.2049560472369194, "step": 1896 }, { "epoch": 0.028326327656619806, "grad_norm": 0.322265625, "grad_norm_var": 0.0015543262163798014, "learning_rate": 0.0001, "loss": 1.5515, "loss/crossentropy": 2.4423259496688843, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.1999405100941658, "step": 1897 }, { "epoch": 0.028341259827234785, "grad_norm": 0.291015625, "grad_norm_var": 0.0015615423520406087, "learning_rate": 0.0001, "loss": 1.6507, "loss/crossentropy": 2.385899782180786, "loss/fcd": 1.44921875, "loss/idx": 7.5, "loss/logits": 0.20146413892507553, "step": 1898 }, { "epoch": 0.028356191997849767, "grad_norm": 0.296875, "grad_norm_var": 0.0015569965044657389, "learning_rate": 0.0001, "loss": 1.4639, "loss/crossentropy": 2.6253907680511475, "loss/fcd": 1.2890625, "loss/idx": 7.5, "loss/logits": 0.17485372722148895, "step": 1899 }, { "epoch": 0.02837112416846475, "grad_norm": 0.314453125, "grad_norm_var": 0.001560652256011963, "learning_rate": 0.0001, "loss": 1.7989, "loss/crossentropy": 2.2645241022109985, "loss/fcd": 1.5234375, "loss/idx": 7.5, "loss/logits": 0.275456503033638, "step": 1900 }, { "epoch": 0.02838605633907973, "grad_norm": 0.2734375, "grad_norm_var": 0.0015957792599995931, "learning_rate": 0.0001, "loss": 1.4147, "loss/crossentropy": 2.492580533027649, "loss/fcd": 1.25390625, "loss/idx": 7.5, "loss/logits": 0.160808227956295, "step": 1901 }, { "epoch": 0.028400988509694712, "grad_norm": 0.28125, "grad_norm_var": 0.0016293803850809733, "learning_rate": 0.0001, "loss": 1.4205, "loss/crossentropy": 2.5176279544830322, "loss/fcd": 1.25390625, "loss/idx": 7.5, "loss/logits": 0.1665561944246292, "step": 1902 }, { "epoch": 0.028415920680309695, "grad_norm": 0.30078125, "grad_norm_var": 0.001603853702545166, "learning_rate": 0.0001, "loss": 1.4826, "loss/crossentropy": 2.77282452583313, "loss/fcd": 1.296875, "loss/idx": 7.5, "loss/logits": 0.18571852147579193, "step": 1903 }, { "epoch": 0.028430852850924674, "grad_norm": 0.2265625, "grad_norm_var": 0.0019400238990783692, "learning_rate": 0.0001, "loss": 1.3425, "loss/crossentropy": 2.3292288780212402, "loss/fcd": 1.18359375, "loss/idx": 7.5, "loss/logits": 0.15895090252161026, "step": 1904 }, { "epoch": 0.028445785021539657, "grad_norm": 0.255859375, "grad_norm_var": 0.0016014695167541504, "learning_rate": 0.0001, "loss": 1.375, "loss/crossentropy": 2.572773575782776, "loss/fcd": 1.21484375, "loss/idx": 7.5, "loss/logits": 0.16017340123653412, "step": 1905 }, { "epoch": 0.028460717192154636, "grad_norm": 0.29296875, "grad_norm_var": 0.0008475899696350098, "learning_rate": 0.0001, "loss": 1.3999, "loss/crossentropy": 2.3504263162612915, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.16164566576480865, "step": 1906 }, { "epoch": 0.02847564936276962, "grad_norm": 0.318359375, "grad_norm_var": 0.0008814454078674317, "learning_rate": 0.0001, "loss": 1.6815, "loss/crossentropy": 2.7879390716552734, "loss/fcd": 1.4375, "loss/idx": 7.5, "loss/logits": 0.2440466284751892, "step": 1907 }, { "epoch": 0.0284905815333846, "grad_norm": 0.515625, "grad_norm_var": 0.004173688093821208, "learning_rate": 0.0001, "loss": 1.8784, "loss/crossentropy": 2.7805620431900024, "loss/fcd": 1.53125, "loss/idx": 7.5, "loss/logits": 0.34715893864631653, "step": 1908 }, { "epoch": 0.02850551370399958, "grad_norm": 0.275390625, "grad_norm_var": 0.004157570997873942, "learning_rate": 0.0001, "loss": 1.5323, "loss/crossentropy": 2.494640588760376, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.20422004163265228, "step": 1909 }, { "epoch": 0.028520445874614563, "grad_norm": 0.376953125, "grad_norm_var": 0.004552710056304932, "learning_rate": 0.0001, "loss": 1.602, "loss/crossentropy": 2.690505266189575, "loss/fcd": 1.3984375, "loss/idx": 7.5, "loss/logits": 0.20359589159488678, "step": 1910 }, { "epoch": 0.028535378045229546, "grad_norm": 0.34765625, "grad_norm_var": 0.0044841726620992025, "learning_rate": 0.0001, "loss": 1.4909, "loss/crossentropy": 2.656151294708252, "loss/fcd": 1.31640625, "loss/idx": 7.5, "loss/logits": 0.17448563873767853, "step": 1911 }, { "epoch": 0.028550310215844525, "grad_norm": 0.302734375, "grad_norm_var": 0.004171498616536458, "learning_rate": 0.0001, "loss": 1.5417, "loss/crossentropy": 2.5565932989120483, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.19017393887043, "step": 1912 }, { "epoch": 0.028565242386459508, "grad_norm": 0.2890625, "grad_norm_var": 0.004195006688435873, "learning_rate": 0.0001, "loss": 1.3229, "loss/crossentropy": 2.700557827949524, "loss/fcd": 1.17578125, "loss/idx": 7.5, "loss/logits": 0.14714757353067398, "step": 1913 }, { "epoch": 0.02858017455707449, "grad_norm": 0.3359375, "grad_norm_var": 0.004207801818847656, "learning_rate": 0.0001, "loss": 1.507, "loss/crossentropy": 2.620209574699402, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.18275465071201324, "step": 1914 }, { "epoch": 0.02859510672768947, "grad_norm": 0.435546875, "grad_norm_var": 0.00511625607808431, "learning_rate": 0.0001, "loss": 1.716, "loss/crossentropy": 2.38981294631958, "loss/fcd": 1.51171875, "loss/idx": 7.5, "loss/logits": 0.2043071985244751, "step": 1915 }, { "epoch": 0.028610038898304452, "grad_norm": 0.296875, "grad_norm_var": 0.005151875813802083, "learning_rate": 0.0001, "loss": 1.476, "loss/crossentropy": 2.3913618326187134, "loss/fcd": 1.296875, "loss/idx": 7.5, "loss/logits": 0.17914563417434692, "step": 1916 }, { "epoch": 0.028624971068919435, "grad_norm": 0.2890625, "grad_norm_var": 0.0050694783528645836, "learning_rate": 0.0001, "loss": 1.4389, "loss/crossentropy": 2.6311800479888916, "loss/fcd": 1.26171875, "loss/idx": 7.5, "loss/logits": 0.1772254854440689, "step": 1917 }, { "epoch": 0.028639903239534414, "grad_norm": 0.26953125, "grad_norm_var": 0.005140622456868489, "learning_rate": 0.0001, "loss": 1.4822, "loss/crossentropy": 2.5183615684509277, "loss/fcd": 1.30859375, "loss/idx": 7.5, "loss/logits": 0.1736455112695694, "step": 1918 }, { "epoch": 0.028654835410149397, "grad_norm": 0.33984375, "grad_norm_var": 0.005132993062337239, "learning_rate": 0.0001, "loss": 1.5154, "loss/crossentropy": 2.4683438539505005, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.1911604180932045, "step": 1919 }, { "epoch": 0.028669767580764376, "grad_norm": 0.33203125, "grad_norm_var": 0.004472096761067708, "learning_rate": 0.0001, "loss": 1.6351, "loss/crossentropy": 2.519033670425415, "loss/fcd": 1.41015625, "loss/idx": 7.5, "loss/logits": 0.2249896377325058, "step": 1920 }, { "epoch": 0.02868469975137936, "grad_norm": 0.2734375, "grad_norm_var": 0.004318602879842122, "learning_rate": 0.0001, "loss": 1.4548, "loss/crossentropy": 2.7004140615463257, "loss/fcd": 1.27734375, "loss/idx": 7.5, "loss/logits": 0.17741407454013824, "step": 1921 }, { "epoch": 0.028699631921994342, "grad_norm": 0.291015625, "grad_norm_var": 0.004328664143880208, "learning_rate": 0.0001, "loss": 1.5395, "loss/crossentropy": 2.0922393798828125, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.18795033544301987, "step": 1922 }, { "epoch": 0.02871456409260932, "grad_norm": 0.326171875, "grad_norm_var": 0.00431976318359375, "learning_rate": 0.0001, "loss": 1.6011, "loss/crossentropy": 2.483629822731018, "loss/fcd": 1.40234375, "loss/idx": 7.5, "loss/logits": 0.1987830325961113, "step": 1923 }, { "epoch": 0.028729496263224304, "grad_norm": 0.2451171875, "grad_norm_var": 0.0022361397743225096, "learning_rate": 0.0001, "loss": 1.5739, "loss/crossentropy": 2.404491424560547, "loss/fcd": 1.375, "loss/idx": 7.5, "loss/logits": 0.19889789074659348, "step": 1924 }, { "epoch": 0.028744428433839286, "grad_norm": 0.310546875, "grad_norm_var": 0.0021317124366760252, "learning_rate": 0.0001, "loss": 1.4945, "loss/crossentropy": 2.7174516916275024, "loss/fcd": 1.3125, "loss/idx": 7.5, "loss/logits": 0.1820102334022522, "step": 1925 }, { "epoch": 0.028759360604454266, "grad_norm": 0.287109375, "grad_norm_var": 0.0019101738929748536, "learning_rate": 0.0001, "loss": 1.4425, "loss/crossentropy": 2.6432723999023438, "loss/fcd": 1.265625, "loss/idx": 7.5, "loss/logits": 0.17692308872938156, "step": 1926 }, { "epoch": 0.02877429277506925, "grad_norm": 0.296875, "grad_norm_var": 0.0018213232358296712, "learning_rate": 0.0001, "loss": 1.6536, "loss/crossentropy": 2.570428252220154, "loss/fcd": 1.43359375, "loss/idx": 7.5, "loss/logits": 0.22002588212490082, "step": 1927 }, { "epoch": 0.02878922494568423, "grad_norm": 0.359375, "grad_norm_var": 0.001985418796539307, "learning_rate": 0.0001, "loss": 1.4552, "loss/crossentropy": 2.6276506185531616, "loss/fcd": 1.28125, "loss/idx": 7.5, "loss/logits": 0.1739986315369606, "step": 1928 }, { "epoch": 0.02880415711629921, "grad_norm": 0.267578125, "grad_norm_var": 0.002077384789784749, "learning_rate": 0.0001, "loss": 1.434, "loss/crossentropy": 2.4807995557785034, "loss/fcd": 1.25, "loss/idx": 7.5, "loss/logits": 0.18403884023427963, "step": 1929 }, { "epoch": 0.028819089286914193, "grad_norm": 0.2890625, "grad_norm_var": 0.0020510633786519367, "learning_rate": 0.0001, "loss": 1.5259, "loss/crossentropy": 2.31441867351532, "loss/fcd": 1.3359375, "loss/idx": 7.5, "loss/logits": 0.18998070061206818, "step": 1930 }, { "epoch": 0.028834021457529172, "grad_norm": 0.29296875, "grad_norm_var": 0.0008745153745015462, "learning_rate": 0.0001, "loss": 1.7168, "loss/crossentropy": 2.4237349033355713, "loss/fcd": 1.46484375, "loss/idx": 7.5, "loss/logits": 0.251992866396904, "step": 1931 }, { "epoch": 0.028848953628144155, "grad_norm": 0.283203125, "grad_norm_var": 0.0008880893389383951, "learning_rate": 0.0001, "loss": 1.4073, "loss/crossentropy": 2.7782613039016724, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.17678172141313553, "step": 1932 }, { "epoch": 0.028863885798759138, "grad_norm": 0.28515625, "grad_norm_var": 0.0008932073911031087, "learning_rate": 0.0001, "loss": 1.5001, "loss/crossentropy": 2.5270785093307495, "loss/fcd": 1.3125, "loss/idx": 7.5, "loss/logits": 0.18764495104551315, "step": 1933 }, { "epoch": 0.028878817969374117, "grad_norm": 0.322265625, "grad_norm_var": 0.0008751829465230306, "learning_rate": 0.0001, "loss": 1.5552, "loss/crossentropy": 2.506539225578308, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.20360128581523895, "step": 1934 }, { "epoch": 0.0288937501399891, "grad_norm": 0.306640625, "grad_norm_var": 0.0007681806882222493, "learning_rate": 0.0001, "loss": 1.5821, "loss/crossentropy": 2.6506450176239014, "loss/fcd": 1.37890625, "loss/idx": 7.5, "loss/logits": 0.20314672589302063, "step": 1935 }, { "epoch": 0.028908682310604082, "grad_norm": 0.279296875, "grad_norm_var": 0.0007029493649800618, "learning_rate": 0.0001, "loss": 1.4767, "loss/crossentropy": 2.7774420976638794, "loss/fcd": 1.28515625, "loss/idx": 7.5, "loss/logits": 0.19157731533050537, "step": 1936 }, { "epoch": 0.02892361448121906, "grad_norm": 0.3828125, "grad_norm_var": 0.001139986515045166, "learning_rate": 0.0001, "loss": 2.0609, "loss/crossentropy": 2.687682032585144, "loss/fcd": 1.69140625, "loss/idx": 7.5, "loss/logits": 0.3694523721933365, "step": 1937 }, { "epoch": 0.028938546651834044, "grad_norm": 0.34375, "grad_norm_var": 0.0012395501136779786, "learning_rate": 0.0001, "loss": 1.6571, "loss/crossentropy": 2.470700263977051, "loss/fcd": 1.44921875, "loss/idx": 7.5, "loss/logits": 0.20788709819316864, "step": 1938 }, { "epoch": 0.028953478822449027, "grad_norm": 0.330078125, "grad_norm_var": 0.0012515981992085774, "learning_rate": 0.0001, "loss": 1.5023, "loss/crossentropy": 2.6560239791870117, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.17812135815620422, "step": 1939 }, { "epoch": 0.028968410993064006, "grad_norm": 0.30078125, "grad_norm_var": 0.0009999593098958333, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.846352458000183, "loss/fcd": 1.28515625, "loss/idx": 7.5, "loss/logits": 0.18193431943655014, "step": 1940 }, { "epoch": 0.02898334316367899, "grad_norm": 0.29296875, "grad_norm_var": 0.00101469357808431, "learning_rate": 0.0001, "loss": 1.5437, "loss/crossentropy": 2.766597867012024, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.2155774012207985, "step": 1941 }, { "epoch": 0.028998275334293968, "grad_norm": 0.2890625, "grad_norm_var": 0.0010096232096354167, "learning_rate": 0.0001, "loss": 1.5658, "loss/crossentropy": 2.692010760307312, "loss/fcd": 1.35546875, "loss/idx": 7.5, "loss/logits": 0.21035537868738174, "step": 1942 }, { "epoch": 0.02901320750490895, "grad_norm": 0.29296875, "grad_norm_var": 0.0010161717732747396, "learning_rate": 0.0001, "loss": 1.5556, "loss/crossentropy": 2.5992730855941772, "loss/fcd": 1.359375, "loss/idx": 7.5, "loss/logits": 0.1961953192949295, "step": 1943 }, { "epoch": 0.029028139675523933, "grad_norm": 0.33203125, "grad_norm_var": 0.0008733113606770833, "learning_rate": 0.0001, "loss": 1.5035, "loss/crossentropy": 2.6337616443634033, "loss/fcd": 1.30859375, "loss/idx": 7.5, "loss/logits": 0.19485964626073837, "step": 1944 }, { "epoch": 0.029043071846138913, "grad_norm": 0.255859375, "grad_norm_var": 0.0009414037068684896, "learning_rate": 0.0001, "loss": 1.421, "loss/crossentropy": 2.5774269104003906, "loss/fcd": 1.25390625, "loss/idx": 7.5, "loss/logits": 0.16705790162086487, "step": 1945 }, { "epoch": 0.029058004016753895, "grad_norm": 0.27734375, "grad_norm_var": 0.0009747823079427083, "learning_rate": 0.0001, "loss": 1.5458, "loss/crossentropy": 2.6940470933914185, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.21767432987689972, "step": 1946 }, { "epoch": 0.029072936187368878, "grad_norm": 0.353515625, "grad_norm_var": 0.0011132399241129557, "learning_rate": 0.0001, "loss": 1.5958, "loss/crossentropy": 2.9539268016815186, "loss/fcd": 1.375, "loss/idx": 7.5, "loss/logits": 0.22076428681612015, "step": 1947 }, { "epoch": 0.029087868357983857, "grad_norm": 0.265625, "grad_norm_var": 0.001190630594889323, "learning_rate": 0.0001, "loss": 1.3962, "loss/crossentropy": 2.6203267574310303, "loss/fcd": 1.2265625, "loss/idx": 7.5, "loss/logits": 0.16960347443819046, "step": 1948 }, { "epoch": 0.02910280052859884, "grad_norm": 0.298828125, "grad_norm_var": 0.00116270383199056, "learning_rate": 0.0001, "loss": 1.4679, "loss/crossentropy": 2.611543297767639, "loss/fcd": 1.2890625, "loss/idx": 7.5, "loss/logits": 0.17885488271713257, "step": 1949 }, { "epoch": 0.029117732699213823, "grad_norm": 0.255859375, "grad_norm_var": 0.0013096968332926432, "learning_rate": 0.0001, "loss": 1.5832, "loss/crossentropy": 2.619404435157776, "loss/fcd": 1.359375, "loss/idx": 7.5, "loss/logits": 0.22386223822832108, "step": 1950 }, { "epoch": 0.029132664869828802, "grad_norm": 0.275390625, "grad_norm_var": 0.00135801633199056, "learning_rate": 0.0001, "loss": 1.4048, "loss/crossentropy": 2.743823528289795, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.16652023047208786, "step": 1951 }, { "epoch": 0.029147597040443785, "grad_norm": 0.326171875, "grad_norm_var": 0.0013557275136311848, "learning_rate": 0.0001, "loss": 1.7229, "loss/crossentropy": 2.8115499019622803, "loss/fcd": 1.46875, "loss/idx": 7.5, "loss/logits": 0.2541176527738571, "step": 1952 }, { "epoch": 0.029162529211058767, "grad_norm": 0.294921875, "grad_norm_var": 0.0009215672810872396, "learning_rate": 0.0001, "loss": 1.5088, "loss/crossentropy": 2.8607544898986816, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.18454967439174652, "step": 1953 }, { "epoch": 0.029177461381673746, "grad_norm": 0.3125, "grad_norm_var": 0.0007964452107747396, "learning_rate": 0.0001, "loss": 1.5189, "loss/crossentropy": 2.498580813407898, "loss/fcd": 1.3359375, "loss/idx": 7.5, "loss/logits": 0.1829310953617096, "step": 1954 }, { "epoch": 0.02919239355228873, "grad_norm": 0.31640625, "grad_norm_var": 0.0007480462392171224, "learning_rate": 0.0001, "loss": 1.4689, "loss/crossentropy": 2.5287530422210693, "loss/fcd": 1.2890625, "loss/idx": 7.5, "loss/logits": 0.17985684424638748, "step": 1955 }, { "epoch": 0.02920732572290371, "grad_norm": 0.390625, "grad_norm_var": 0.0013066450754801431, "learning_rate": 0.0001, "loss": 1.6387, "loss/crossentropy": 2.7909224033355713, "loss/fcd": 1.44140625, "loss/idx": 7.5, "loss/logits": 0.19726381450891495, "step": 1956 }, { "epoch": 0.02922225789351869, "grad_norm": 0.296875, "grad_norm_var": 0.001302957534790039, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.6084100008010864, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.20470361411571503, "step": 1957 }, { "epoch": 0.029237190064133674, "grad_norm": 0.314453125, "grad_norm_var": 0.0012990315755208334, "learning_rate": 0.0001, "loss": 1.7085, "loss/crossentropy": 2.459324359893799, "loss/fcd": 1.4609375, "loss/idx": 7.5, "loss/logits": 0.24756821244955063, "step": 1958 }, { "epoch": 0.029252122234748653, "grad_norm": 0.28125, "grad_norm_var": 0.0013243993123372396, "learning_rate": 0.0001, "loss": 1.3924, "loss/crossentropy": 2.470743775367737, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.16190864145755768, "step": 1959 }, { "epoch": 0.029267054405363636, "grad_norm": 0.306640625, "grad_norm_var": 0.001266336441040039, "learning_rate": 0.0001, "loss": 1.6331, "loss/crossentropy": 2.5596195459365845, "loss/fcd": 1.39453125, "loss/idx": 7.5, "loss/logits": 0.2385719120502472, "step": 1960 }, { "epoch": 0.02928198657597862, "grad_norm": 0.30078125, "grad_norm_var": 0.0011197408040364584, "learning_rate": 0.0001, "loss": 1.5841, "loss/crossentropy": 2.8690768480300903, "loss/fcd": 1.3671875, "loss/idx": 7.5, "loss/logits": 0.21691838651895523, "step": 1961 }, { "epoch": 0.029296918746593598, "grad_norm": 0.28125, "grad_norm_var": 0.001106707255045573, "learning_rate": 0.0001, "loss": 1.3773, "loss/crossentropy": 2.466285824775696, "loss/fcd": 1.21484375, "loss/idx": 7.5, "loss/logits": 0.16250278055667877, "step": 1962 }, { "epoch": 0.02931185091720858, "grad_norm": 0.365234375, "grad_norm_var": 0.0011919657389322916, "learning_rate": 0.0001, "loss": 1.7176, "loss/crossentropy": 2.648834228515625, "loss/fcd": 1.4765625, "loss/idx": 7.5, "loss/logits": 0.24099178612232208, "step": 1963 }, { "epoch": 0.029326783087823563, "grad_norm": 0.296875, "grad_norm_var": 0.0010882059733072916, "learning_rate": 0.0001, "loss": 1.3692, "loss/crossentropy": 2.532212018966675, "loss/fcd": 1.2109375, "loss/idx": 7.5, "loss/logits": 0.15824826806783676, "step": 1964 }, { "epoch": 0.029341715258438542, "grad_norm": 0.3203125, "grad_norm_var": 0.001093276341756185, "learning_rate": 0.0001, "loss": 1.6864, "loss/crossentropy": 2.49002468585968, "loss/fcd": 1.42578125, "loss/idx": 7.5, "loss/logits": 0.26066526770591736, "step": 1965 }, { "epoch": 0.029356647429053525, "grad_norm": 0.28515625, "grad_norm_var": 0.0009414037068684896, "learning_rate": 0.0001, "loss": 1.5496, "loss/crossentropy": 2.5018492937088013, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.1980273500084877, "step": 1966 }, { "epoch": 0.029371579599668504, "grad_norm": 0.287109375, "grad_norm_var": 0.0008954366048177083, "learning_rate": 0.0001, "loss": 1.434, "loss/crossentropy": 2.5343021154403687, "loss/fcd": 1.26171875, "loss/idx": 7.5, "loss/logits": 0.17231228947639465, "step": 1967 }, { "epoch": 0.029386511770283487, "grad_norm": 0.37109375, "grad_norm_var": 0.001112222671508789, "learning_rate": 0.0001, "loss": 1.6817, "loss/crossentropy": 2.352415919303894, "loss/fcd": 1.453125, "loss/idx": 7.5, "loss/logits": 0.22859769314527512, "step": 1968 }, { "epoch": 0.02940144394089847, "grad_norm": 0.30859375, "grad_norm_var": 0.0010894139607747395, "learning_rate": 0.0001, "loss": 1.4004, "loss/crossentropy": 2.6096965074539185, "loss/fcd": 1.234375, "loss/idx": 7.5, "loss/logits": 0.16599421203136444, "step": 1969 }, { "epoch": 0.02941637611151345, "grad_norm": 0.26171875, "grad_norm_var": 0.0012654622395833334, "learning_rate": 0.0001, "loss": 1.3373, "loss/crossentropy": 2.629533290863037, "loss/fcd": 1.18359375, "loss/idx": 7.5, "loss/logits": 0.15375210344791412, "step": 1970 }, { "epoch": 0.02943130828212843, "grad_norm": 0.27734375, "grad_norm_var": 0.0013353983561197917, "learning_rate": 0.0001, "loss": 1.2961, "loss/crossentropy": 2.737032413482666, "loss/fcd": 1.14453125, "loss/idx": 7.5, "loss/logits": 0.15156958997249603, "step": 1971 }, { "epoch": 0.029446240452743414, "grad_norm": 0.33203125, "grad_norm_var": 0.0009129206339518229, "learning_rate": 0.0001, "loss": 1.4493, "loss/crossentropy": 2.472022771835327, "loss/fcd": 1.2890625, "loss/idx": 7.5, "loss/logits": 0.16024889796972275, "step": 1972 }, { "epoch": 0.029461172623358393, "grad_norm": 0.2890625, "grad_norm_var": 0.0009256362915039063, "learning_rate": 0.0001, "loss": 1.5485, "loss/crossentropy": 2.567961573600769, "loss/fcd": 1.3359375, "loss/idx": 7.5, "loss/logits": 0.21256595849990845, "step": 1973 }, { "epoch": 0.029476104793973376, "grad_norm": 0.287109375, "grad_norm_var": 0.000937652587890625, "learning_rate": 0.0001, "loss": 1.4892, "loss/crossentropy": 2.444976806640625, "loss/fcd": 1.3046875, "loss/idx": 7.5, "loss/logits": 0.1844826564192772, "step": 1974 }, { "epoch": 0.02949103696458836, "grad_norm": 0.25390625, "grad_norm_var": 0.0010644912719726563, "learning_rate": 0.0001, "loss": 1.4066, "loss/crossentropy": 2.696893095970154, "loss/fcd": 1.234375, "loss/idx": 7.5, "loss/logits": 0.17225909233093262, "step": 1975 }, { "epoch": 0.029505969135203338, "grad_norm": 0.298828125, "grad_norm_var": 0.0010629653930664062, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.5711617469787598, "loss/fcd": 1.19921875, "loss/idx": 7.5, "loss/logits": 0.15751300007104874, "step": 1976 }, { "epoch": 0.02952090130581832, "grad_norm": 0.28125, "grad_norm_var": 0.0010874430338541667, "learning_rate": 0.0001, "loss": 1.4822, "loss/crossentropy": 2.373708963394165, "loss/fcd": 1.296875, "loss/idx": 7.5, "loss/logits": 0.18534115701913834, "step": 1977 }, { "epoch": 0.029535833476433303, "grad_norm": 0.302734375, "grad_norm_var": 0.0010631402333577474, "learning_rate": 0.0001, "loss": 1.7471, "loss/crossentropy": 2.311202645301819, "loss/fcd": 1.49609375, "loss/idx": 7.5, "loss/logits": 0.2510295584797859, "step": 1978 }, { "epoch": 0.029550765647048283, "grad_norm": 0.8828125, "grad_norm_var": 0.022228749593098958, "learning_rate": 0.0001, "loss": 1.7238, "loss/crossentropy": 2.7768548727035522, "loss/fcd": 1.48828125, "loss/idx": 7.5, "loss/logits": 0.2354755625128746, "step": 1979 }, { "epoch": 0.029565697817663265, "grad_norm": 0.328125, "grad_norm_var": 0.022137196858723958, "learning_rate": 0.0001, "loss": 1.5399, "loss/crossentropy": 2.6792709827423096, "loss/fcd": 1.3359375, "loss/idx": 7.5, "loss/logits": 0.20392261445522308, "step": 1980 }, { "epoch": 0.029580629988278245, "grad_norm": 0.2490234375, "grad_norm_var": 0.02259870767593384, "learning_rate": 0.0001, "loss": 1.3753, "loss/crossentropy": 2.567617893218994, "loss/fcd": 1.203125, "loss/idx": 7.5, "loss/logits": 0.17222069203853607, "step": 1981 }, { "epoch": 0.029595562158893227, "grad_norm": 0.3515625, "grad_norm_var": 0.022468467553456623, "learning_rate": 0.0001, "loss": 1.6633, "loss/crossentropy": 2.656588912010193, "loss/fcd": 1.4453125, "loss/idx": 7.5, "loss/logits": 0.2179877832531929, "step": 1982 }, { "epoch": 0.02961049432950821, "grad_norm": 0.28125, "grad_norm_var": 0.022508140405019125, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.546592593193054, "loss/fcd": 1.28125, "loss/idx": 7.5, "loss/logits": 0.18279610574245453, "step": 1983 }, { "epoch": 0.02962542650012319, "grad_norm": 0.3203125, "grad_norm_var": 0.022423422336578368, "learning_rate": 0.0001, "loss": 1.6634, "loss/crossentropy": 2.407332420349121, "loss/fcd": 1.453125, "loss/idx": 7.5, "loss/logits": 0.21023399382829666, "step": 1984 }, { "epoch": 0.029640358670738172, "grad_norm": 0.298828125, "grad_norm_var": 0.022459344069163004, "learning_rate": 0.0001, "loss": 1.4897, "loss/crossentropy": 2.2389989495277405, "loss/fcd": 1.3046875, "loss/idx": 7.5, "loss/logits": 0.18500903248786926, "step": 1985 }, { "epoch": 0.029655290841353155, "grad_norm": 0.287109375, "grad_norm_var": 0.02226511240005493, "learning_rate": 0.0001, "loss": 1.5513, "loss/crossentropy": 2.5988144874572754, "loss/fcd": 1.35546875, "loss/idx": 7.5, "loss/logits": 0.1958768144249916, "step": 1986 }, { "epoch": 0.029670223011968134, "grad_norm": 0.263671875, "grad_norm_var": 0.02237748702367147, "learning_rate": 0.0001, "loss": 1.5422, "loss/crossentropy": 2.69570255279541, "loss/fcd": 1.34375, "loss/idx": 7.5, "loss/logits": 0.1984579861164093, "step": 1987 }, { "epoch": 0.029685155182583117, "grad_norm": 0.314453125, "grad_norm_var": 0.02239608367284139, "learning_rate": 0.0001, "loss": 1.5841, "loss/crossentropy": 2.6110461950302124, "loss/fcd": 1.37890625, "loss/idx": 7.5, "loss/logits": 0.20521266758441925, "step": 1988 }, { "epoch": 0.0297000873531981, "grad_norm": 0.31640625, "grad_norm_var": 0.022291274865468343, "learning_rate": 0.0001, "loss": 1.3848, "loss/crossentropy": 2.859120726585388, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.15436360239982605, "step": 1989 }, { "epoch": 0.02971501952381308, "grad_norm": 0.31640625, "grad_norm_var": 0.022168250878651936, "learning_rate": 0.0001, "loss": 1.5489, "loss/crossentropy": 2.4848233461380005, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.1973266825079918, "step": 1990 }, { "epoch": 0.02972995169442806, "grad_norm": 0.26953125, "grad_norm_var": 0.02201629877090454, "learning_rate": 0.0001, "loss": 1.311, "loss/crossentropy": 2.53887939453125, "loss/fcd": 1.16015625, "loss/idx": 7.5, "loss/logits": 0.15087512135505676, "step": 1991 }, { "epoch": 0.02974488386504304, "grad_norm": 0.32421875, "grad_norm_var": 0.021933646996816, "learning_rate": 0.0001, "loss": 1.4663, "loss/crossentropy": 2.592678189277649, "loss/fcd": 1.27734375, "loss/idx": 7.5, "loss/logits": 0.188975490629673, "step": 1992 }, { "epoch": 0.029759816035658023, "grad_norm": 0.306640625, "grad_norm_var": 0.02178611358006795, "learning_rate": 0.0001, "loss": 1.5695, "loss/crossentropy": 2.6848541498184204, "loss/fcd": 1.37890625, "loss/idx": 7.5, "loss/logits": 0.1906232088804245, "step": 1993 }, { "epoch": 0.029774748206273006, "grad_norm": 0.291015625, "grad_norm_var": 0.021850295861562095, "learning_rate": 0.0001, "loss": 1.4406, "loss/crossentropy": 2.8219099044799805, "loss/fcd": 1.26171875, "loss/idx": 7.5, "loss/logits": 0.17891719192266464, "step": 1994 }, { "epoch": 0.029789680376887985, "grad_norm": 0.3203125, "grad_norm_var": 0.0007336576779683431, "learning_rate": 0.0001, "loss": 1.5743, "loss/crossentropy": 2.4911776781082153, "loss/fcd": 1.3671875, "loss/idx": 7.5, "loss/logits": 0.20711027085781097, "step": 1995 }, { "epoch": 0.029804612547502968, "grad_norm": 0.31640625, "grad_norm_var": 0.0007020910580952962, "learning_rate": 0.0001, "loss": 1.566, "loss/crossentropy": 2.567915916442871, "loss/fcd": 1.3671875, "loss/idx": 7.5, "loss/logits": 0.1987716257572174, "step": 1996 }, { "epoch": 0.02981954471811795, "grad_norm": 0.287109375, "grad_norm_var": 0.0005252679189046224, "learning_rate": 0.0001, "loss": 1.4267, "loss/crossentropy": 2.614750385284424, "loss/fcd": 1.2421875, "loss/idx": 7.5, "loss/logits": 0.1844882145524025, "step": 1997 }, { "epoch": 0.02983447688873293, "grad_norm": 0.33984375, "grad_norm_var": 0.0004596551259358724, "learning_rate": 0.0001, "loss": 1.4933, "loss/crossentropy": 2.769694447517395, "loss/fcd": 1.3046875, "loss/idx": 7.5, "loss/logits": 0.18861094117164612, "step": 1998 }, { "epoch": 0.029849409059347912, "grad_norm": 0.27734375, "grad_norm_var": 0.0004721164703369141, "learning_rate": 0.0001, "loss": 1.5203, "loss/crossentropy": 2.5993359088897705, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.19222304224967957, "step": 1999 }, { "epoch": 0.029864341229962895, "grad_norm": 0.26171875, "grad_norm_var": 0.000552225112915039, "learning_rate": 0.0001, "loss": 1.5111, "loss/crossentropy": 2.7430869340896606, "loss/fcd": 1.30859375, "loss/idx": 7.5, "loss/logits": 0.2025001049041748, "step": 2000 }, { "epoch": 0.029879273400577874, "grad_norm": 0.31640625, "grad_norm_var": 0.0005701065063476562, "learning_rate": 0.0001, "loss": 1.5057, "loss/crossentropy": 2.5706042051315308, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.18146125972270966, "step": 2001 }, { "epoch": 0.029894205571192857, "grad_norm": 0.27734375, "grad_norm_var": 0.0005935509999593099, "learning_rate": 0.0001, "loss": 1.3009, "loss/crossentropy": 2.4674673080444336, "loss/fcd": 1.1640625, "loss/idx": 7.5, "loss/logits": 0.13687089458107948, "step": 2002 }, { "epoch": 0.029909137741807836, "grad_norm": 0.30078125, "grad_norm_var": 0.0005002339680989583, "learning_rate": 0.0001, "loss": 1.5865, "loss/crossentropy": 2.5412967205047607, "loss/fcd": 1.37890625, "loss/idx": 7.5, "loss/logits": 0.20754456520080566, "step": 2003 }, { "epoch": 0.02992406991242282, "grad_norm": 0.271484375, "grad_norm_var": 0.0005456924438476563, "learning_rate": 0.0001, "loss": 1.4447, "loss/crossentropy": 2.6974633932113647, "loss/fcd": 1.265625, "loss/idx": 7.5, "loss/logits": 0.17905978858470917, "step": 2004 }, { "epoch": 0.0299390020830378, "grad_norm": 0.306640625, "grad_norm_var": 0.0005297183990478516, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.7850738763809204, "loss/fcd": 1.25, "loss/idx": 7.5, "loss/logits": 0.1746155545115471, "step": 2005 }, { "epoch": 0.02995393425365278, "grad_norm": 0.27734375, "grad_norm_var": 0.0005341688791910807, "learning_rate": 0.0001, "loss": 1.4023, "loss/crossentropy": 2.6332948207855225, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.16399270296096802, "step": 2006 }, { "epoch": 0.029968866424267764, "grad_norm": 0.27734375, "grad_norm_var": 0.0005098819732666015, "learning_rate": 0.0001, "loss": 1.4224, "loss/crossentropy": 2.5699846744537354, "loss/fcd": 1.2578125, "loss/idx": 7.5, "loss/logits": 0.16462134569883347, "step": 2007 }, { "epoch": 0.029983798594882746, "grad_norm": 0.34765625, "grad_norm_var": 0.0006292819976806641, "learning_rate": 0.0001, "loss": 1.8109, "loss/crossentropy": 2.5490691661834717, "loss/fcd": 1.5625, "loss/idx": 7.5, "loss/logits": 0.2484118416905403, "step": 2008 }, { "epoch": 0.029998730765497725, "grad_norm": 0.306640625, "grad_norm_var": 0.0006292819976806641, "learning_rate": 0.0001, "loss": 1.4684, "loss/crossentropy": 2.4754668474197388, "loss/fcd": 1.296875, "loss/idx": 7.5, "loss/logits": 0.17154840379953384, "step": 2009 }, { "epoch": 0.030013662936112708, "grad_norm": 0.4921875, "grad_norm_var": 0.0029589335123697915, "learning_rate": 0.0001, "loss": 1.4543, "loss/crossentropy": 2.639050245285034, "loss/fcd": 1.2890625, "loss/idx": 8.0, "loss/logits": 0.1652303785085678, "step": 2010 }, { "epoch": 0.03002859510672769, "grad_norm": 0.69140625, "grad_norm_var": 0.012024879455566406, "learning_rate": 0.0001, "loss": 1.6818, "loss/crossentropy": 2.6371554136276245, "loss/fcd": 1.47265625, "loss/idx": 8.0, "loss/logits": 0.20909534394741058, "step": 2011 }, { "epoch": 0.03004352727734267, "grad_norm": 0.55078125, "grad_norm_var": 0.014901161193847656, "learning_rate": 0.0001, "loss": 1.5841, "loss/crossentropy": 2.538976550102234, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.18562434613704681, "step": 2012 }, { "epoch": 0.030058459447957653, "grad_norm": 0.68359375, "grad_norm_var": 0.021460835138956705, "learning_rate": 0.0001, "loss": 1.9555, "loss/crossentropy": 2.738535761833191, "loss/fcd": 1.69921875, "loss/idx": 8.0, "loss/logits": 0.25624898076057434, "step": 2013 }, { "epoch": 0.030073391618572635, "grad_norm": 0.50390625, "grad_norm_var": 0.022403446833292644, "learning_rate": 0.0001, "loss": 1.534, "loss/crossentropy": 2.6512542963027954, "loss/fcd": 1.359375, "loss/idx": 8.0, "loss/logits": 0.17459186911582947, "step": 2014 }, { "epoch": 0.030088323789187615, "grad_norm": 0.48046875, "grad_norm_var": 0.022095982233683267, "learning_rate": 0.0001, "loss": 1.5728, "loss/crossentropy": 2.735915184020996, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.17435404658317566, "step": 2015 }, { "epoch": 0.030103255959802597, "grad_norm": 0.50390625, "grad_norm_var": 0.021406157811482748, "learning_rate": 0.0001, "loss": 1.5799, "loss/crossentropy": 2.6755112409591675, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.19315673410892487, "step": 2016 }, { "epoch": 0.030118188130417577, "grad_norm": 0.5, "grad_norm_var": 0.021179056167602538, "learning_rate": 0.0001, "loss": 1.6325, "loss/crossentropy": 2.568297863006592, "loss/fcd": 1.4375, "loss/idx": 8.0, "loss/logits": 0.19503623992204666, "step": 2017 }, { "epoch": 0.03013312030103256, "grad_norm": 0.458984375, "grad_norm_var": 0.019708251953125, "learning_rate": 0.0001, "loss": 1.5839, "loss/crossentropy": 2.6288214921951294, "loss/fcd": 1.39453125, "loss/idx": 8.0, "loss/logits": 0.18933219462633133, "step": 2018 }, { "epoch": 0.030148052471647542, "grad_norm": 0.478515625, "grad_norm_var": 0.01851207415262858, "learning_rate": 0.0001, "loss": 1.6099, "loss/crossentropy": 2.5832252502441406, "loss/fcd": 1.4296875, "loss/idx": 8.0, "loss/logits": 0.18023230880498886, "step": 2019 }, { "epoch": 0.03016298464226252, "grad_norm": 0.8359375, "grad_norm_var": 0.025315093994140624, "learning_rate": 0.0001, "loss": 1.9554, "loss/crossentropy": 2.8931599855422974, "loss/fcd": 1.70703125, "loss/idx": 8.0, "loss/logits": 0.24832381308078766, "step": 2020 }, { "epoch": 0.030177916812877504, "grad_norm": 0.455078125, "grad_norm_var": 0.0232421875, "learning_rate": 0.0001, "loss": 1.6765, "loss/crossentropy": 2.558019757270813, "loss/fcd": 1.46875, "loss/idx": 8.0, "loss/logits": 0.2077295109629631, "step": 2021 }, { "epoch": 0.030192848983492487, "grad_norm": 0.478515625, "grad_norm_var": 0.020061222712198894, "learning_rate": 0.0001, "loss": 1.66, "loss/crossentropy": 2.8639663457870483, "loss/fcd": 1.44921875, "loss/idx": 8.0, "loss/logits": 0.21081604808568954, "step": 2022 }, { "epoch": 0.030207781154107466, "grad_norm": 0.56640625, "grad_norm_var": 0.016593790054321288, "learning_rate": 0.0001, "loss": 1.6434, "loss/crossentropy": 2.5051262378692627, "loss/fcd": 1.44921875, "loss/idx": 8.0, "loss/logits": 0.1941494643688202, "step": 2023 }, { "epoch": 0.03022271332472245, "grad_norm": 0.5390625, "grad_norm_var": 0.014462900161743165, "learning_rate": 0.0001, "loss": 1.5621, "loss/crossentropy": 2.718661069869995, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.16365427523851395, "step": 2024 }, { "epoch": 0.03023764549533743, "grad_norm": 0.55859375, "grad_norm_var": 0.010831642150878906, "learning_rate": 0.0001, "loss": 1.9914, "loss/crossentropy": 2.397466778755188, "loss/fcd": 1.70703125, "loss/idx": 8.0, "loss/logits": 0.28435733169317245, "step": 2025 }, { "epoch": 0.03025257766595241, "grad_norm": 0.453125, "grad_norm_var": 0.011220741271972656, "learning_rate": 0.0001, "loss": 1.5968, "loss/crossentropy": 2.455097198486328, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.178861565887928, "step": 2026 }, { "epoch": 0.030267509836567393, "grad_norm": 0.4765625, "grad_norm_var": 0.009944407145182292, "learning_rate": 0.0001, "loss": 1.6487, "loss/crossentropy": 2.446172595024109, "loss/fcd": 1.45703125, "loss/idx": 8.0, "loss/logits": 0.19163452088832855, "step": 2027 }, { "epoch": 0.030282442007182372, "grad_norm": 1.453125, "grad_norm_var": 0.06300703684488933, "learning_rate": 0.0001, "loss": 2.0797, "loss/crossentropy": 2.5350029468536377, "loss/fcd": 1.8046875, "loss/idx": 8.0, "loss/logits": 0.27498847246170044, "step": 2028 }, { "epoch": 0.030297374177797355, "grad_norm": 0.421875, "grad_norm_var": 0.06399103800455729, "learning_rate": 0.0001, "loss": 1.5441, "loss/crossentropy": 2.6789642572402954, "loss/fcd": 1.36328125, "loss/idx": 8.0, "loss/logits": 0.18083027750253677, "step": 2029 }, { "epoch": 0.030312306348412338, "grad_norm": 0.47265625, "grad_norm_var": 0.06433893839518229, "learning_rate": 0.0001, "loss": 1.8213, "loss/crossentropy": 2.6662744283676147, "loss/fcd": 1.5625, "loss/idx": 8.0, "loss/logits": 0.25878019630908966, "step": 2030 }, { "epoch": 0.030327238519027317, "grad_norm": 0.443359375, "grad_norm_var": 0.06487196286519369, "learning_rate": 0.0001, "loss": 1.7545, "loss/crossentropy": 2.5721651315689087, "loss/fcd": 1.53515625, "loss/idx": 8.0, "loss/logits": 0.21935239434242249, "step": 2031 }, { "epoch": 0.0303421706896423, "grad_norm": 0.52734375, "grad_norm_var": 0.06470449765523274, "learning_rate": 0.0001, "loss": 1.6393, "loss/crossentropy": 2.525114059448242, "loss/fcd": 1.44921875, "loss/idx": 8.0, "loss/logits": 0.19008611142635345, "step": 2032 }, { "epoch": 0.030357102860257282, "grad_norm": 0.4609375, "grad_norm_var": 0.06516416867574056, "learning_rate": 0.0001, "loss": 1.6858, "loss/crossentropy": 2.3392094373703003, "loss/fcd": 1.46875, "loss/idx": 8.0, "loss/logits": 0.21701061725616455, "step": 2033 }, { "epoch": 0.03037203503087226, "grad_norm": 0.484375, "grad_norm_var": 0.06483707427978516, "learning_rate": 0.0001, "loss": 1.7443, "loss/crossentropy": 2.6104544401168823, "loss/fcd": 1.51953125, "loss/idx": 8.0, "loss/logits": 0.22481046617031097, "step": 2034 }, { "epoch": 0.030386967201487244, "grad_norm": 0.4921875, "grad_norm_var": 0.06468364397684732, "learning_rate": 0.0001, "loss": 1.6242, "loss/crossentropy": 2.7361948490142822, "loss/fcd": 1.42578125, "loss/idx": 8.0, "loss/logits": 0.19845832884311676, "step": 2035 }, { "epoch": 0.030401899372102227, "grad_norm": 0.52734375, "grad_norm_var": 0.05969109535217285, "learning_rate": 0.0001, "loss": 1.6764, "loss/crossentropy": 2.759740471839905, "loss/fcd": 1.46484375, "loss/idx": 8.0, "loss/logits": 0.2115342915058136, "step": 2036 }, { "epoch": 0.030416831542717206, "grad_norm": 0.419921875, "grad_norm_var": 0.06021637916564941, "learning_rate": 0.0001, "loss": 1.5598, "loss/crossentropy": 2.6317158937454224, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.17306915670633316, "step": 2037 }, { "epoch": 0.03043176371333219, "grad_norm": 0.408203125, "grad_norm_var": 0.06118111610412598, "learning_rate": 0.0001, "loss": 1.7399, "loss/crossentropy": 2.4931308031082153, "loss/fcd": 1.51171875, "loss/idx": 8.0, "loss/logits": 0.22813451290130615, "step": 2038 }, { "epoch": 0.030446695883947168, "grad_norm": 0.51171875, "grad_norm_var": 0.061205148696899414, "learning_rate": 0.0001, "loss": 1.7832, "loss/crossentropy": 2.579059600830078, "loss/fcd": 1.5546875, "loss/idx": 8.0, "loss/logits": 0.22846680879592896, "step": 2039 }, { "epoch": 0.03046162805456215, "grad_norm": 0.6953125, "grad_norm_var": 0.06269796689351399, "learning_rate": 0.0001, "loss": 1.9305, "loss/crossentropy": 2.724112033843994, "loss/fcd": 1.671875, "loss/idx": 8.0, "loss/logits": 0.25861097127199173, "step": 2040 }, { "epoch": 0.030476560225177134, "grad_norm": 0.421875, "grad_norm_var": 0.06371712684631348, "learning_rate": 0.0001, "loss": 1.525, "loss/crossentropy": 2.5424829721450806, "loss/fcd": 1.34375, "loss/idx": 8.0, "loss/logits": 0.18127263337373734, "step": 2041 }, { "epoch": 0.030491492395792113, "grad_norm": 0.431640625, "grad_norm_var": 0.06400019327799479, "learning_rate": 0.0001, "loss": 1.5612, "loss/crossentropy": 2.6636070013046265, "loss/fcd": 1.37890625, "loss/idx": 8.0, "loss/logits": 0.18225979059934616, "step": 2042 }, { "epoch": 0.030506424566407096, "grad_norm": 0.3984375, "grad_norm_var": 0.06504796346028646, "learning_rate": 0.0001, "loss": 1.6068, "loss/crossentropy": 2.5607352256774902, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.18880213797092438, "step": 2043 }, { "epoch": 0.03052135673702208, "grad_norm": 0.388671875, "grad_norm_var": 0.005648914972941081, "learning_rate": 0.0001, "loss": 1.6389, "loss/crossentropy": 2.469869613647461, "loss/fcd": 1.44140625, "loss/idx": 8.0, "loss/logits": 0.19749997556209564, "step": 2044 }, { "epoch": 0.030536288907637057, "grad_norm": 0.412109375, "grad_norm_var": 0.005716387430826823, "learning_rate": 0.0001, "loss": 1.455, "loss/crossentropy": 2.6016329526901245, "loss/fcd": 1.29296875, "loss/idx": 8.0, "loss/logits": 0.16200660169124603, "step": 2045 }, { "epoch": 0.03055122107825204, "grad_norm": 0.384765625, "grad_norm_var": 0.006150547663370768, "learning_rate": 0.0001, "loss": 1.5372, "loss/crossentropy": 2.429213047027588, "loss/fcd": 1.3671875, "loss/idx": 8.0, "loss/logits": 0.17000386118888855, "step": 2046 }, { "epoch": 0.030566153248867023, "grad_norm": 0.4453125, "grad_norm_var": 0.006145668029785156, "learning_rate": 0.0001, "loss": 1.6709, "loss/crossentropy": 2.571095824241638, "loss/fcd": 1.45703125, "loss/idx": 8.0, "loss/logits": 0.21386200934648514, "step": 2047 }, { "epoch": 0.030581085419482002, "grad_norm": 0.3984375, "grad_norm_var": 0.00608062744140625, "learning_rate": 0.0001, "loss": 1.554, "loss/crossentropy": 2.332857370376587, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.16728917509317398, "step": 2048 }, { "epoch": 0.030596017590096985, "grad_norm": 0.40625, "grad_norm_var": 0.006224822998046875, "learning_rate": 0.0001, "loss": 1.5855, "loss/crossentropy": 2.5588884353637695, "loss/fcd": 1.390625, "loss/idx": 8.0, "loss/logits": 0.19485174119472504, "step": 2049 }, { "epoch": 0.030610949760711968, "grad_norm": 0.6015625, "grad_norm_var": 0.00759429931640625, "learning_rate": 0.0001, "loss": 1.7673, "loss/crossentropy": 2.650652766227722, "loss/fcd": 1.5390625, "loss/idx": 8.0, "loss/logits": 0.228206567466259, "step": 2050 }, { "epoch": 0.030625881931326947, "grad_norm": 0.5078125, "grad_norm_var": 0.0076787312825520836, "learning_rate": 0.0001, "loss": 1.8268, "loss/crossentropy": 2.2954814434051514, "loss/fcd": 1.5859375, "loss/idx": 8.0, "loss/logits": 0.24084149301052094, "step": 2051 }, { "epoch": 0.03064081410194193, "grad_norm": 0.44921875, "grad_norm_var": 0.0073582967122395836, "learning_rate": 0.0001, "loss": 1.5947, "loss/crossentropy": 2.6009691953659058, "loss/fcd": 1.41015625, "loss/idx": 8.0, "loss/logits": 0.18451987951993942, "step": 2052 }, { "epoch": 0.03065574627255691, "grad_norm": 0.5390625, "grad_norm_var": 0.007686980565388997, "learning_rate": 0.0001, "loss": 1.6462, "loss/crossentropy": 2.6434242725372314, "loss/fcd": 1.453125, "loss/idx": 8.0, "loss/logits": 0.19302675873041153, "step": 2053 }, { "epoch": 0.03067067844317189, "grad_norm": 0.41015625, "grad_norm_var": 0.007673072814941406, "learning_rate": 0.0001, "loss": 1.5812, "loss/crossentropy": 2.7117305994033813, "loss/fcd": 1.390625, "loss/idx": 8.0, "loss/logits": 0.19056762754917145, "step": 2054 }, { "epoch": 0.030685610613786874, "grad_norm": 0.408203125, "grad_norm_var": 0.007665491104125977, "learning_rate": 0.0001, "loss": 1.7181, "loss/crossentropy": 2.651281714439392, "loss/fcd": 1.5078125, "loss/idx": 8.0, "loss/logits": 0.21033576875925064, "step": 2055 }, { "epoch": 0.030700542784401853, "grad_norm": 0.408203125, "grad_norm_var": 0.0036630630493164062, "learning_rate": 0.0001, "loss": 1.5785, "loss/crossentropy": 2.728546619415283, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.19182241708040237, "step": 2056 }, { "epoch": 0.030715474955016836, "grad_norm": 0.431640625, "grad_norm_var": 0.0036477247873942056, "learning_rate": 0.0001, "loss": 1.59, "loss/crossentropy": 2.5823835134506226, "loss/fcd": 1.40625, "loss/idx": 8.0, "loss/logits": 0.18379104137420654, "step": 2057 }, { "epoch": 0.03073040712563182, "grad_norm": 0.5, "grad_norm_var": 0.0038741429646809897, "learning_rate": 0.0001, "loss": 1.7259, "loss/crossentropy": 2.8470664024353027, "loss/fcd": 1.515625, "loss/idx": 8.0, "loss/logits": 0.21027395129203796, "step": 2058 }, { "epoch": 0.030745339296246798, "grad_norm": 0.484375, "grad_norm_var": 0.0038237889607747396, "learning_rate": 0.0001, "loss": 1.6979, "loss/crossentropy": 2.735305428504944, "loss/fcd": 1.4765625, "loss/idx": 8.0, "loss/logits": 0.22134840488433838, "step": 2059 }, { "epoch": 0.03076027146686178, "grad_norm": 0.4921875, "grad_norm_var": 0.0036679426829020183, "learning_rate": 0.0001, "loss": 1.7422, "loss/crossentropy": 2.470483183860779, "loss/fcd": 1.53125, "loss/idx": 8.0, "loss/logits": 0.21097531914710999, "step": 2060 }, { "epoch": 0.030775203637476763, "grad_norm": 0.46484375, "grad_norm_var": 0.003540484110514323, "learning_rate": 0.0001, "loss": 1.5169, "loss/crossentropy": 2.5108784437179565, "loss/fcd": 1.34375, "loss/idx": 8.0, "loss/logits": 0.17310867458581924, "step": 2061 }, { "epoch": 0.030790135808091743, "grad_norm": 0.404296875, "grad_norm_var": 0.003372955322265625, "learning_rate": 0.0001, "loss": 1.5464, "loss/crossentropy": 2.5907052755355835, "loss/fcd": 1.37109375, "loss/idx": 8.0, "loss/logits": 0.1752755492925644, "step": 2062 }, { "epoch": 0.030805067978706725, "grad_norm": 0.439453125, "grad_norm_var": 0.0033861637115478516, "learning_rate": 0.0001, "loss": 1.5666, "loss/crossentropy": 2.8992494344711304, "loss/fcd": 1.39453125, "loss/idx": 8.0, "loss/logits": 0.17206592112779617, "step": 2063 }, { "epoch": 0.030820000149321704, "grad_norm": 0.419921875, "grad_norm_var": 0.0032412211100260415, "learning_rate": 0.0001, "loss": 1.6906, "loss/crossentropy": 2.5843944549560547, "loss/fcd": 1.484375, "loss/idx": 8.0, "loss/logits": 0.20618122071027756, "step": 2064 }, { "epoch": 0.030834932319936687, "grad_norm": 0.380859375, "grad_norm_var": 0.003465000788370768, "learning_rate": 0.0001, "loss": 1.5056, "loss/crossentropy": 2.77576220035553, "loss/fcd": 1.33203125, "loss/idx": 8.0, "loss/logits": 0.17361772060394287, "step": 2065 }, { "epoch": 0.03084986449055167, "grad_norm": 0.4140625, "grad_norm_var": 0.0020947615305582684, "learning_rate": 0.0001, "loss": 1.6511, "loss/crossentropy": 2.7077430486679077, "loss/fcd": 1.453125, "loss/idx": 8.0, "loss/logits": 0.19799936562776566, "step": 2066 }, { "epoch": 0.03086479666116665, "grad_norm": 0.42578125, "grad_norm_var": 0.0018517653147379558, "learning_rate": 0.0001, "loss": 1.5037, "loss/crossentropy": 2.6036940813064575, "loss/fcd": 1.33203125, "loss/idx": 8.0, "loss/logits": 0.1716543808579445, "step": 2067 }, { "epoch": 0.030879728831781632, "grad_norm": 0.451171875, "grad_norm_var": 0.0018538792928059896, "learning_rate": 0.0001, "loss": 1.6308, "loss/crossentropy": 2.531284213066101, "loss/fcd": 1.4453125, "loss/idx": 8.0, "loss/logits": 0.18552187085151672, "step": 2068 }, { "epoch": 0.030894661002396615, "grad_norm": 0.40625, "grad_norm_var": 0.0012399673461914063, "learning_rate": 0.0001, "loss": 1.6177, "loss/crossentropy": 2.56545627117157, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.19586841017007828, "step": 2069 }, { "epoch": 0.030909593173011594, "grad_norm": 0.48046875, "grad_norm_var": 0.0013269424438476563, "learning_rate": 0.0001, "loss": 1.7749, "loss/crossentropy": 2.674584984779358, "loss/fcd": 1.5390625, "loss/idx": 8.0, "loss/logits": 0.23580867052078247, "step": 2070 }, { "epoch": 0.030924525343626576, "grad_norm": 0.44140625, "grad_norm_var": 0.0012629032135009766, "learning_rate": 0.0001, "loss": 1.6777, "loss/crossentropy": 2.7862950563430786, "loss/fcd": 1.47265625, "loss/idx": 8.0, "loss/logits": 0.20500106364488602, "step": 2071 }, { "epoch": 0.03093945751424156, "grad_norm": 0.43359375, "grad_norm_var": 0.0011945088704427083, "learning_rate": 0.0001, "loss": 1.7364, "loss/crossentropy": 2.7037689685821533, "loss/fcd": 1.51953125, "loss/idx": 8.0, "loss/logits": 0.2168433517217636, "step": 2072 }, { "epoch": 0.03095438968485654, "grad_norm": 0.416015625, "grad_norm_var": 0.0012311299641927084, "learning_rate": 0.0001, "loss": 1.6903, "loss/crossentropy": 2.3502912521362305, "loss/fcd": 1.47265625, "loss/idx": 8.0, "loss/logits": 0.2176072746515274, "step": 2073 }, { "epoch": 0.03096932185547152, "grad_norm": 0.59765625, "grad_norm_var": 0.002596473693847656, "learning_rate": 0.0001, "loss": 1.7199, "loss/crossentropy": 2.2926315665245056, "loss/fcd": 1.5390625, "loss/idx": 8.0, "loss/logits": 0.1808125004172325, "step": 2074 }, { "epoch": 0.030984254026086504, "grad_norm": 0.6640625, "grad_norm_var": 0.005509376525878906, "learning_rate": 0.0001, "loss": 1.7191, "loss/crossentropy": 2.4893484115600586, "loss/fcd": 1.53125, "loss/idx": 8.0, "loss/logits": 0.18783427774906158, "step": 2075 }, { "epoch": 0.030999186196701483, "grad_norm": 0.4609375, "grad_norm_var": 0.00542901357014974, "learning_rate": 0.0001, "loss": 1.6299, "loss/crossentropy": 2.6521389484405518, "loss/fcd": 1.43359375, "loss/idx": 8.0, "loss/logits": 0.19627613574266434, "step": 2076 }, { "epoch": 0.031014118367316466, "grad_norm": 0.40625, "grad_norm_var": 0.0055768330891927086, "learning_rate": 0.0001, "loss": 1.5575, "loss/crossentropy": 2.739071488380432, "loss/fcd": 1.37890625, "loss/idx": 8.0, "loss/logits": 0.17858368903398514, "step": 2077 }, { "epoch": 0.031029050537931445, "grad_norm": 0.396484375, "grad_norm_var": 0.005631001790364584, "learning_rate": 0.0001, "loss": 1.5612, "loss/crossentropy": 2.589470148086548, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.17444124072790146, "step": 2078 }, { "epoch": 0.031043982708546428, "grad_norm": 0.546875, "grad_norm_var": 0.006170384089152018, "learning_rate": 0.0001, "loss": 1.8608, "loss/crossentropy": 2.2858415842056274, "loss/fcd": 1.6328125, "loss/idx": 8.0, "loss/logits": 0.22799421846866608, "step": 2079 }, { "epoch": 0.03105891487916141, "grad_norm": 0.439453125, "grad_norm_var": 0.00609281857808431, "learning_rate": 0.0001, "loss": 1.6194, "loss/crossentropy": 2.8109859228134155, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.20147809386253357, "step": 2080 }, { "epoch": 0.03107384704977639, "grad_norm": 0.46484375, "grad_norm_var": 0.005646514892578125, "learning_rate": 0.0001, "loss": 1.722, "loss/crossentropy": 2.7435665130615234, "loss/fcd": 1.5078125, "loss/idx": 8.0, "loss/logits": 0.21414213627576828, "step": 2081 }, { "epoch": 0.031088779220391372, "grad_norm": 0.400390625, "grad_norm_var": 0.005751657485961914, "learning_rate": 0.0001, "loss": 1.4966, "loss/crossentropy": 2.6140815019607544, "loss/fcd": 1.33203125, "loss/idx": 8.0, "loss/logits": 0.16453833878040314, "step": 2082 }, { "epoch": 0.031103711391006355, "grad_norm": 0.5859375, "grad_norm_var": 0.006528457005818685, "learning_rate": 0.0001, "loss": 1.7894, "loss/crossentropy": 2.7133957147598267, "loss/fcd": 1.5625, "loss/idx": 8.0, "loss/logits": 0.2269287183880806, "step": 2083 }, { "epoch": 0.031118643561621334, "grad_norm": 0.451171875, "grad_norm_var": 0.006528457005818685, "learning_rate": 0.0001, "loss": 1.7787, "loss/crossentropy": 2.4352492094039917, "loss/fcd": 1.546875, "loss/idx": 8.0, "loss/logits": 0.23179451376199722, "step": 2084 }, { "epoch": 0.031133575732236317, "grad_norm": 0.45703125, "grad_norm_var": 0.006227604548136393, "learning_rate": 0.0001, "loss": 1.6537, "loss/crossentropy": 2.606731414794922, "loss/fcd": 1.44921875, "loss/idx": 8.0, "loss/logits": 0.20451100170612335, "step": 2085 }, { "epoch": 0.0311485079028513, "grad_norm": 0.369140625, "grad_norm_var": 0.006960550944010417, "learning_rate": 0.0001, "loss": 1.6305, "loss/crossentropy": 2.463603377342224, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.21251866221427917, "step": 2086 }, { "epoch": 0.03116344007346628, "grad_norm": 0.482421875, "grad_norm_var": 0.006905476252237956, "learning_rate": 0.0001, "loss": 1.7032, "loss/crossentropy": 2.741825819015503, "loss/fcd": 1.4921875, "loss/idx": 8.0, "loss/logits": 0.21100960671901703, "step": 2087 }, { "epoch": 0.03117837224408126, "grad_norm": 0.427734375, "grad_norm_var": 0.006938616434733073, "learning_rate": 0.0001, "loss": 1.6517, "loss/crossentropy": 2.401910662651062, "loss/fcd": 1.44140625, "loss/idx": 8.0, "loss/logits": 0.21025388687849045, "step": 2088 }, { "epoch": 0.03119330441469624, "grad_norm": 0.439453125, "grad_norm_var": 0.006795183817545573, "learning_rate": 0.0001, "loss": 1.6674, "loss/crossentropy": 2.5581270456314087, "loss/fcd": 1.46875, "loss/idx": 8.0, "loss/logits": 0.1986994743347168, "step": 2089 }, { "epoch": 0.031208236585311223, "grad_norm": 0.390625, "grad_norm_var": 0.006070709228515625, "learning_rate": 0.0001, "loss": 1.5077, "loss/crossentropy": 2.7275644540786743, "loss/fcd": 1.34375, "loss/idx": 8.0, "loss/logits": 0.16392534226179123, "step": 2090 }, { "epoch": 0.031223168755926206, "grad_norm": 0.486328125, "grad_norm_var": 0.0032429854075113934, "learning_rate": 0.0001, "loss": 1.7185, "loss/crossentropy": 2.5507593154907227, "loss/fcd": 1.5078125, "loss/idx": 8.0, "loss/logits": 0.2106621116399765, "step": 2091 }, { "epoch": 0.031238100926541185, "grad_norm": 0.451171875, "grad_norm_var": 0.003235117594401042, "learning_rate": 0.0001, "loss": 1.6756, "loss/crossentropy": 2.4704352617263794, "loss/fcd": 1.47265625, "loss/idx": 8.0, "loss/logits": 0.20292042195796967, "step": 2092 }, { "epoch": 0.03125303309715617, "grad_norm": 0.52734375, "grad_norm_var": 0.0034499486287434896, "learning_rate": 0.0001, "loss": 1.9378, "loss/crossentropy": 2.4260478019714355, "loss/fcd": 1.671875, "loss/idx": 8.0, "loss/logits": 0.2659446597099304, "step": 2093 }, { "epoch": 0.03126796526777115, "grad_norm": 0.44921875, "grad_norm_var": 0.0031963189442952475, "learning_rate": 0.0001, "loss": 1.7063, "loss/crossentropy": 2.5973998308181763, "loss/fcd": 1.4765625, "loss/idx": 8.0, "loss/logits": 0.22971642017364502, "step": 2094 }, { "epoch": 0.03128289743838613, "grad_norm": 0.4296875, "grad_norm_var": 0.002706130345662435, "learning_rate": 0.0001, "loss": 1.7716, "loss/crossentropy": 2.3425583839416504, "loss/fcd": 1.56640625, "loss/idx": 8.0, "loss/logits": 0.20517488569021225, "step": 2095 }, { "epoch": 0.03129782960900111, "grad_norm": 0.58984375, "grad_norm_var": 0.0038431167602539064, "learning_rate": 0.0001, "loss": 1.6076, "loss/crossentropy": 2.7817490100860596, "loss/fcd": 1.4296875, "loss/idx": 8.0, "loss/logits": 0.17791728675365448, "step": 2096 }, { "epoch": 0.031312761779616095, "grad_norm": 0.439453125, "grad_norm_var": 0.0038759708404541016, "learning_rate": 0.0001, "loss": 1.7351, "loss/crossentropy": 2.650373935699463, "loss/fcd": 1.51171875, "loss/idx": 8.0, "loss/logits": 0.223370760679245, "step": 2097 }, { "epoch": 0.03132769395023108, "grad_norm": 0.40625, "grad_norm_var": 0.003830718994140625, "learning_rate": 0.0001, "loss": 1.6517, "loss/crossentropy": 2.3085328340530396, "loss/fcd": 1.453125, "loss/idx": 8.0, "loss/logits": 0.19854824990034103, "step": 2098 }, { "epoch": 0.031342626120846054, "grad_norm": 0.427734375, "grad_norm_var": 0.0027685642242431642, "learning_rate": 0.0001, "loss": 1.4798, "loss/crossentropy": 2.8068811893463135, "loss/fcd": 1.3203125, "loss/idx": 8.0, "loss/logits": 0.15951743721961975, "step": 2099 }, { "epoch": 0.031357558291461037, "grad_norm": 0.384765625, "grad_norm_var": 0.0030474185943603514, "learning_rate": 0.0001, "loss": 1.5183, "loss/crossentropy": 2.620944023132324, "loss/fcd": 1.34375, "loss/idx": 8.0, "loss/logits": 0.17456384748220444, "step": 2100 }, { "epoch": 0.03137249046207602, "grad_norm": 0.3984375, "grad_norm_var": 0.003186655044555664, "learning_rate": 0.0001, "loss": 1.5679, "loss/crossentropy": 2.488567590713501, "loss/fcd": 1.3828125, "loss/idx": 8.0, "loss/logits": 0.18512221425771713, "step": 2101 }, { "epoch": 0.031387422632691, "grad_norm": 0.5, "grad_norm_var": 0.0029555638631184895, "learning_rate": 0.0001, "loss": 1.6677, "loss/crossentropy": 2.4931570291519165, "loss/fcd": 1.46484375, "loss/idx": 8.0, "loss/logits": 0.2028091996908188, "step": 2102 }, { "epoch": 0.031402354803305985, "grad_norm": 0.435546875, "grad_norm_var": 0.0029021581013997395, "learning_rate": 0.0001, "loss": 1.6961, "loss/crossentropy": 2.777703046798706, "loss/fcd": 1.4921875, "loss/idx": 8.0, "loss/logits": 0.20393580943346024, "step": 2103 }, { "epoch": 0.03141728697392097, "grad_norm": 0.498046875, "grad_norm_var": 0.0030120213826497397, "learning_rate": 0.0001, "loss": 1.8301, "loss/crossentropy": 2.976956844329834, "loss/fcd": 1.58203125, "loss/idx": 8.0, "loss/logits": 0.2480453997850418, "step": 2104 }, { "epoch": 0.03143221914453594, "grad_norm": 0.404296875, "grad_norm_var": 0.003154500325520833, "learning_rate": 0.0001, "loss": 1.5582, "loss/crossentropy": 2.641094446182251, "loss/fcd": 1.375, "loss/idx": 8.0, "loss/logits": 0.18317919969558716, "step": 2105 }, { "epoch": 0.031447151315150926, "grad_norm": 0.4375, "grad_norm_var": 0.0029134114583333334, "learning_rate": 0.0001, "loss": 1.5931, "loss/crossentropy": 2.619847297668457, "loss/fcd": 1.4140625, "loss/idx": 8.0, "loss/logits": 0.17907262593507767, "step": 2106 }, { "epoch": 0.03146208348576591, "grad_norm": 0.43359375, "grad_norm_var": 0.0028606255849202475, "learning_rate": 0.0001, "loss": 1.7044, "loss/crossentropy": 2.535176396369934, "loss/fcd": 1.50390625, "loss/idx": 8.0, "loss/logits": 0.2004462108016014, "step": 2107 }, { "epoch": 0.03147701565638089, "grad_norm": 0.51953125, "grad_norm_var": 0.0031560262044270835, "learning_rate": 0.0001, "loss": 1.6846, "loss/crossentropy": 2.4883205890655518, "loss/fcd": 1.46484375, "loss/idx": 8.0, "loss/logits": 0.21974647045135498, "step": 2108 }, { "epoch": 0.031491947826995874, "grad_norm": 0.466796875, "grad_norm_var": 0.0028017520904541015, "learning_rate": 0.0001, "loss": 1.8459, "loss/crossentropy": 2.544515013694763, "loss/fcd": 1.6015625, "loss/idx": 8.0, "loss/logits": 0.2443794086575508, "step": 2109 }, { "epoch": 0.03150687999761085, "grad_norm": 0.408203125, "grad_norm_var": 0.002918243408203125, "learning_rate": 0.0001, "loss": 1.6459, "loss/crossentropy": 2.576838493347168, "loss/fcd": 1.4453125, "loss/idx": 8.0, "loss/logits": 0.20058216154575348, "step": 2110 }, { "epoch": 0.03152181216822583, "grad_norm": 0.38671875, "grad_norm_var": 0.0031427383422851563, "learning_rate": 0.0001, "loss": 1.5009, "loss/crossentropy": 2.6612937450408936, "loss/fcd": 1.328125, "loss/idx": 8.0, "loss/logits": 0.1727575585246086, "step": 2111 }, { "epoch": 0.031536744338840815, "grad_norm": 0.4296875, "grad_norm_var": 0.0016751607259114583, "learning_rate": 0.0001, "loss": 1.64, "loss/crossentropy": 2.420051693916321, "loss/fcd": 1.453125, "loss/idx": 8.0, "loss/logits": 0.18690764904022217, "step": 2112 }, { "epoch": 0.0315516765094558, "grad_norm": 0.4140625, "grad_norm_var": 0.0017038822174072266, "learning_rate": 0.0001, "loss": 1.4809, "loss/crossentropy": 2.7675206661224365, "loss/fcd": 1.31640625, "loss/idx": 8.0, "loss/logits": 0.16452692449092865, "step": 2113 }, { "epoch": 0.03156660868007078, "grad_norm": 0.50390625, "grad_norm_var": 0.0019327640533447266, "learning_rate": 0.0001, "loss": 1.722, "loss/crossentropy": 2.7690247297286987, "loss/fcd": 1.5234375, "loss/idx": 8.0, "loss/logits": 0.19851567596197128, "step": 2114 }, { "epoch": 0.03158154085068576, "grad_norm": 0.89453125, "grad_norm_var": 0.01475372314453125, "learning_rate": 0.0001, "loss": 1.7978, "loss/crossentropy": 2.332781672477722, "loss/fcd": 1.5859375, "loss/idx": 8.0, "loss/logits": 0.21183109283447266, "step": 2115 }, { "epoch": 0.03159647302130074, "grad_norm": 0.419921875, "grad_norm_var": 0.014432716369628906, "learning_rate": 0.0001, "loss": 1.6409, "loss/crossentropy": 2.3394250869750977, "loss/fcd": 1.4453125, "loss/idx": 8.0, "loss/logits": 0.1955602690577507, "step": 2116 }, { "epoch": 0.03161140519191572, "grad_norm": 0.6171875, "grad_norm_var": 0.01528008778889974, "learning_rate": 0.0001, "loss": 1.6938, "loss/crossentropy": 2.66051983833313, "loss/fcd": 1.4921875, "loss/idx": 8.0, "loss/logits": 0.2015744373202324, "step": 2117 }, { "epoch": 0.031626337362530704, "grad_norm": 0.578125, "grad_norm_var": 0.015811602274576824, "learning_rate": 0.0001, "loss": 1.6875, "loss/crossentropy": 2.56989061832428, "loss/fcd": 1.48828125, "loss/idx": 8.0, "loss/logits": 0.19926504790782928, "step": 2118 }, { "epoch": 0.03164126953314569, "grad_norm": 0.458984375, "grad_norm_var": 0.015674273173014324, "learning_rate": 0.0001, "loss": 1.6327, "loss/crossentropy": 2.2720203399658203, "loss/fcd": 1.4453125, "loss/idx": 8.0, "loss/logits": 0.18734042346477509, "step": 2119 }, { "epoch": 0.03165620170376067, "grad_norm": 0.4375, "grad_norm_var": 0.0158541202545166, "learning_rate": 0.0001, "loss": 1.6693, "loss/crossentropy": 2.462122082710266, "loss/fcd": 1.4609375, "loss/idx": 8.0, "loss/logits": 0.2083945944905281, "step": 2120 }, { "epoch": 0.031671133874375645, "grad_norm": 0.4765625, "grad_norm_var": 0.015372467041015626, "learning_rate": 0.0001, "loss": 1.786, "loss/crossentropy": 2.5700005292892456, "loss/fcd": 1.53515625, "loss/idx": 8.0, "loss/logits": 0.25080136209726334, "step": 2121 }, { "epoch": 0.03168606604499063, "grad_norm": 0.62109375, "grad_norm_var": 0.01612847646077474, "learning_rate": 0.0001, "loss": 1.8047, "loss/crossentropy": 2.717989683151245, "loss/fcd": 1.56640625, "loss/idx": 8.0, "loss/logits": 0.23825186491012573, "step": 2122 }, { "epoch": 0.03170099821560561, "grad_norm": 0.439453125, "grad_norm_var": 0.016075499852498374, "learning_rate": 0.0001, "loss": 1.6256, "loss/crossentropy": 2.71493399143219, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.20763880014419556, "step": 2123 }, { "epoch": 0.031715930386220594, "grad_norm": 0.53125, "grad_norm_var": 0.016107543309529623, "learning_rate": 0.0001, "loss": 1.7208, "loss/crossentropy": 2.837113618850708, "loss/fcd": 1.5, "loss/idx": 8.0, "loss/logits": 0.22079221159219742, "step": 2124 }, { "epoch": 0.031730862556835576, "grad_norm": 0.435546875, "grad_norm_var": 0.016328795750935873, "learning_rate": 0.0001, "loss": 1.6488, "loss/crossentropy": 2.549302101135254, "loss/fcd": 1.453125, "loss/idx": 8.0, "loss/logits": 0.1956993266940117, "step": 2125 }, { "epoch": 0.03174579472745056, "grad_norm": 0.404296875, "grad_norm_var": 0.016379276911417644, "learning_rate": 0.0001, "loss": 1.6305, "loss/crossentropy": 2.8148945569992065, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.21253588795661926, "step": 2126 }, { "epoch": 0.031760726898065535, "grad_norm": 0.375, "grad_norm_var": 0.01656963030497233, "learning_rate": 0.0001, "loss": 1.6413, "loss/crossentropy": 2.5911394357681274, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.2194361537694931, "step": 2127 }, { "epoch": 0.03177565906868052, "grad_norm": 0.380859375, "grad_norm_var": 0.017191505432128905, "learning_rate": 0.0001, "loss": 1.751, "loss/crossentropy": 2.415476083755493, "loss/fcd": 1.50390625, "loss/idx": 8.0, "loss/logits": 0.24706026166677475, "step": 2128 }, { "epoch": 0.0317905912392955, "grad_norm": 0.423828125, "grad_norm_var": 0.017086521784464518, "learning_rate": 0.0001, "loss": 1.5425, "loss/crossentropy": 2.501340627670288, "loss/fcd": 1.37109375, "loss/idx": 8.0, "loss/logits": 0.17138050496578217, "step": 2129 }, { "epoch": 0.03180552340991048, "grad_norm": 0.453125, "grad_norm_var": 0.017220417658487957, "learning_rate": 0.0001, "loss": 1.7605, "loss/crossentropy": 2.572074055671692, "loss/fcd": 1.53515625, "loss/idx": 8.0, "loss/logits": 0.22529959678649902, "step": 2130 }, { "epoch": 0.031820455580525465, "grad_norm": 0.40625, "grad_norm_var": 0.006221373875935872, "learning_rate": 0.0001, "loss": 1.6826, "loss/crossentropy": 2.5933796167373657, "loss/fcd": 1.47265625, "loss/idx": 8.0, "loss/logits": 0.20995663106441498, "step": 2131 }, { "epoch": 0.03183538775114044, "grad_norm": 0.466796875, "grad_norm_var": 0.006069548924763997, "learning_rate": 0.0001, "loss": 1.7738, "loss/crossentropy": 2.382189631462097, "loss/fcd": 1.546875, "loss/idx": 8.0, "loss/logits": 0.22692985087633133, "step": 2132 }, { "epoch": 0.031850319921755424, "grad_norm": 0.416015625, "grad_norm_var": 0.004627227783203125, "learning_rate": 0.0001, "loss": 1.5846, "loss/crossentropy": 2.6335387229919434, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.19791582971811295, "step": 2133 }, { "epoch": 0.03186525209237041, "grad_norm": 0.38671875, "grad_norm_var": 0.0038141250610351563, "learning_rate": 0.0001, "loss": 1.5226, "loss/crossentropy": 2.525994300842285, "loss/fcd": 1.34765625, "loss/idx": 8.0, "loss/logits": 0.17492203414440155, "step": 2134 }, { "epoch": 0.03188018426298539, "grad_norm": 0.3984375, "grad_norm_var": 0.003926960627237955, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.39855694770813, "loss/fcd": 1.3515625, "loss/idx": 8.0, "loss/logits": 0.17706363648176193, "step": 2135 }, { "epoch": 0.03189511643360037, "grad_norm": 0.546875, "grad_norm_var": 0.004626576105753581, "learning_rate": 0.0001, "loss": 1.8717, "loss/crossentropy": 2.543327569961548, "loss/fcd": 1.609375, "loss/idx": 8.0, "loss/logits": 0.2623720169067383, "step": 2136 }, { "epoch": 0.031910048604215355, "grad_norm": 0.419921875, "grad_norm_var": 0.0046085993448893225, "learning_rate": 0.0001, "loss": 1.6589, "loss/crossentropy": 2.614794373512268, "loss/fcd": 1.46484375, "loss/idx": 8.0, "loss/logits": 0.1940615028142929, "step": 2137 }, { "epoch": 0.03192498077483033, "grad_norm": 0.4140625, "grad_norm_var": 0.0024014790852864582, "learning_rate": 0.0001, "loss": 1.5278, "loss/crossentropy": 2.6671642065048218, "loss/fcd": 1.34375, "loss/idx": 8.0, "loss/logits": 0.18408206850290298, "step": 2138 }, { "epoch": 0.03193991294544531, "grad_norm": 0.515625, "grad_norm_var": 0.0028484185536702474, "learning_rate": 0.0001, "loss": 1.7855, "loss/crossentropy": 2.6619362831115723, "loss/fcd": 1.55078125, "loss/idx": 8.0, "loss/logits": 0.23473163694143295, "step": 2139 }, { "epoch": 0.031954845116060296, "grad_norm": 0.43359375, "grad_norm_var": 0.0022030989329020183, "learning_rate": 0.0001, "loss": 1.69, "loss/crossentropy": 2.6156463623046875, "loss/fcd": 1.4765625, "loss/idx": 8.0, "loss/logits": 0.21340154856443405, "step": 2140 }, { "epoch": 0.03196977728667528, "grad_norm": 0.50390625, "grad_norm_var": 0.002547454833984375, "learning_rate": 0.0001, "loss": 1.903, "loss/crossentropy": 2.7787933349609375, "loss/fcd": 1.64453125, "loss/idx": 8.0, "loss/logits": 0.2585112303495407, "step": 2141 }, { "epoch": 0.03198470945729026, "grad_norm": 0.427734375, "grad_norm_var": 0.00248870849609375, "learning_rate": 0.0001, "loss": 1.4153, "loss/crossentropy": 2.6235398054122925, "loss/fcd": 1.26171875, "loss/idx": 8.0, "loss/logits": 0.15354054421186447, "step": 2142 }, { "epoch": 0.03199964162790524, "grad_norm": 0.41796875, "grad_norm_var": 0.002257219950358073, "learning_rate": 0.0001, "loss": 1.6726, "loss/crossentropy": 2.4736419916152954, "loss/fcd": 1.4453125, "loss/idx": 8.0, "loss/logits": 0.22727616131305695, "step": 2143 }, { "epoch": 0.03201457379852022, "grad_norm": 0.44921875, "grad_norm_var": 0.00202635129292806, "learning_rate": 0.0001, "loss": 1.6829, "loss/crossentropy": 2.4605672359466553, "loss/fcd": 1.47265625, "loss/idx": 8.0, "loss/logits": 0.21023346483707428, "step": 2144 }, { "epoch": 0.0320295059691352, "grad_norm": 0.404296875, "grad_norm_var": 0.002098830540974935, "learning_rate": 0.0001, "loss": 1.5409, "loss/crossentropy": 2.487582802772522, "loss/fcd": 1.37109375, "loss/idx": 8.0, "loss/logits": 0.16980616748332977, "step": 2145 }, { "epoch": 0.032044438139750185, "grad_norm": 0.451171875, "grad_norm_var": 0.0020959854125976564, "learning_rate": 0.0001, "loss": 1.6167, "loss/crossentropy": 2.7920517921447754, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.1948300078511238, "step": 2146 }, { "epoch": 0.03205937031036517, "grad_norm": 0.478515625, "grad_norm_var": 0.0020859877268473307, "learning_rate": 0.0001, "loss": 1.7866, "loss/crossentropy": 2.598605751991272, "loss/fcd": 1.5390625, "loss/idx": 8.0, "loss/logits": 0.24758008867502213, "step": 2147 }, { "epoch": 0.03207430248098015, "grad_norm": 0.44140625, "grad_norm_var": 0.002054786682128906, "learning_rate": 0.0001, "loss": 1.5801, "loss/crossentropy": 2.7116726636886597, "loss/fcd": 1.39453125, "loss/idx": 8.0, "loss/logits": 0.18552692234516144, "step": 2148 }, { "epoch": 0.032089234651595126, "grad_norm": 0.431640625, "grad_norm_var": 0.002011553446451823, "learning_rate": 0.0001, "loss": 1.6369, "loss/crossentropy": 2.658483147621155, "loss/fcd": 1.4375, "loss/idx": 8.0, "loss/logits": 0.19944186508655548, "step": 2149 }, { "epoch": 0.03210416682221011, "grad_norm": 0.37890625, "grad_norm_var": 0.0020761489868164062, "learning_rate": 0.0001, "loss": 1.5265, "loss/crossentropy": 2.505324602127075, "loss/fcd": 1.34375, "loss/idx": 8.0, "loss/logits": 0.18275200575590134, "step": 2150 }, { "epoch": 0.03211909899282509, "grad_norm": 0.361328125, "grad_norm_var": 0.0023905277252197266, "learning_rate": 0.0001, "loss": 1.6025, "loss/crossentropy": 2.391192674636841, "loss/fcd": 1.40625, "loss/idx": 8.0, "loss/logits": 0.1962529793381691, "step": 2151 }, { "epoch": 0.032134031163440074, "grad_norm": 0.5078125, "grad_norm_var": 0.0019410292307535807, "learning_rate": 0.0001, "loss": 1.6435, "loss/crossentropy": 2.6363203525543213, "loss/fcd": 1.453125, "loss/idx": 8.0, "loss/logits": 0.19036279618740082, "step": 2152 }, { "epoch": 0.03214896333405506, "grad_norm": 0.3984375, "grad_norm_var": 0.0020268758138020835, "learning_rate": 0.0001, "loss": 1.5014, "loss/crossentropy": 2.5317020416259766, "loss/fcd": 1.328125, "loss/idx": 8.0, "loss/logits": 0.17322948575019836, "step": 2153 }, { "epoch": 0.03216389550467004, "grad_norm": 0.421875, "grad_norm_var": 0.0020052591959635415, "learning_rate": 0.0001, "loss": 1.5897, "loss/crossentropy": 2.636947512626648, "loss/fcd": 1.40234375, "loss/idx": 8.0, "loss/logits": 0.18739458918571472, "step": 2154 }, { "epoch": 0.032178827675285016, "grad_norm": 0.400390625, "grad_norm_var": 0.0016573429107666015, "learning_rate": 0.0001, "loss": 1.5166, "loss/crossentropy": 2.754707455635071, "loss/fcd": 1.3515625, "loss/idx": 8.0, "loss/logits": 0.16504278779029846, "step": 2155 }, { "epoch": 0.0321937598459, "grad_norm": 0.484375, "grad_norm_var": 0.001830911636352539, "learning_rate": 0.0001, "loss": 1.6567, "loss/crossentropy": 2.3877068758010864, "loss/fcd": 1.47265625, "loss/idx": 8.0, "loss/logits": 0.18407829850912094, "step": 2156 }, { "epoch": 0.03220869201651498, "grad_norm": 0.423828125, "grad_norm_var": 0.0014952977498372396, "learning_rate": 0.0001, "loss": 1.4625, "loss/crossentropy": 2.561208724975586, "loss/fcd": 1.29296875, "loss/idx": 8.0, "loss/logits": 0.16951829195022583, "step": 2157 }, { "epoch": 0.032223624187129964, "grad_norm": 0.64453125, "grad_norm_var": 0.004369338353474935, "learning_rate": 0.0001, "loss": 1.8887, "loss/crossentropy": 2.755163550376892, "loss/fcd": 1.66015625, "loss/idx": 8.0, "loss/logits": 0.22858187556266785, "step": 2158 }, { "epoch": 0.032238556357744946, "grad_norm": 0.462890625, "grad_norm_var": 0.0043426513671875, "learning_rate": 0.0001, "loss": 1.5943, "loss/crossentropy": 2.61116886138916, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.19589432328939438, "step": 2159 }, { "epoch": 0.03225348852835992, "grad_norm": 0.49609375, "grad_norm_var": 0.004498291015625, "learning_rate": 0.0001, "loss": 1.772, "loss/crossentropy": 2.4020588397979736, "loss/fcd": 1.5546875, "loss/idx": 8.0, "loss/logits": 0.21733030676841736, "step": 2160 }, { "epoch": 0.032268420698974905, "grad_norm": 0.478515625, "grad_norm_var": 0.004398028055826823, "learning_rate": 0.0001, "loss": 1.5957, "loss/crossentropy": 2.3794524669647217, "loss/fcd": 1.4140625, "loss/idx": 8.0, "loss/logits": 0.1816871240735054, "step": 2161 }, { "epoch": 0.03228335286958989, "grad_norm": 0.3828125, "grad_norm_var": 0.004714568456013997, "learning_rate": 0.0001, "loss": 1.4818, "loss/crossentropy": 2.6590569019317627, "loss/fcd": 1.3125, "loss/idx": 8.0, "loss/logits": 0.1693245992064476, "step": 2162 }, { "epoch": 0.03229828504020487, "grad_norm": 0.439453125, "grad_norm_var": 0.004659255345662435, "learning_rate": 0.0001, "loss": 1.5862, "loss/crossentropy": 2.6106516122817993, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.19950973987579346, "step": 2163 }, { "epoch": 0.03231321721081985, "grad_norm": 0.5078125, "grad_norm_var": 0.004884068171183268, "learning_rate": 0.0001, "loss": 1.7863, "loss/crossentropy": 2.7897473573684692, "loss/fcd": 1.54296875, "loss/idx": 8.0, "loss/logits": 0.24332761764526367, "step": 2164 }, { "epoch": 0.032328149381434836, "grad_norm": 0.451171875, "grad_norm_var": 0.004856729507446289, "learning_rate": 0.0001, "loss": 1.6491, "loss/crossentropy": 2.5986064672470093, "loss/fcd": 1.43359375, "loss/idx": 8.0, "loss/logits": 0.21547991037368774, "step": 2165 }, { "epoch": 0.03234308155204981, "grad_norm": 0.4453125, "grad_norm_var": 0.004480600357055664, "learning_rate": 0.0001, "loss": 1.7188, "loss/crossentropy": 2.555765151977539, "loss/fcd": 1.50390625, "loss/idx": 8.0, "loss/logits": 0.21494029462337494, "step": 2166 }, { "epoch": 0.032358013722664794, "grad_norm": 0.455078125, "grad_norm_var": 0.003838205337524414, "learning_rate": 0.0001, "loss": 1.5076, "loss/crossentropy": 2.633586049079895, "loss/fcd": 1.328125, "loss/idx": 8.0, "loss/logits": 0.17944791167974472, "step": 2167 }, { "epoch": 0.03237294589327978, "grad_norm": 0.37890625, "grad_norm_var": 0.004098367691040039, "learning_rate": 0.0001, "loss": 1.519, "loss/crossentropy": 2.4690948724746704, "loss/fcd": 1.3359375, "loss/idx": 8.0, "loss/logits": 0.1830184906721115, "step": 2168 }, { "epoch": 0.03238787806389476, "grad_norm": 0.39453125, "grad_norm_var": 0.004128503799438477, "learning_rate": 0.0001, "loss": 1.458, "loss/crossentropy": 2.4733498096466064, "loss/fcd": 1.30078125, "loss/idx": 8.0, "loss/logits": 0.1571984589099884, "step": 2169 }, { "epoch": 0.03240281023450974, "grad_norm": 0.416015625, "grad_norm_var": 0.004155921936035156, "learning_rate": 0.0001, "loss": 1.6743, "loss/crossentropy": 2.3886845111846924, "loss/fcd": 1.48046875, "loss/idx": 8.0, "loss/logits": 0.19382460415363312, "step": 2170 }, { "epoch": 0.03241774240512472, "grad_norm": 0.408203125, "grad_norm_var": 0.004104042053222656, "learning_rate": 0.0001, "loss": 1.4675, "loss/crossentropy": 2.541450023651123, "loss/fcd": 1.2890625, "loss/idx": 8.0, "loss/logits": 0.17844083905220032, "step": 2171 }, { "epoch": 0.0324326745757397, "grad_norm": 0.392578125, "grad_norm_var": 0.004263162612915039, "learning_rate": 0.0001, "loss": 1.5884, "loss/crossentropy": 2.322944402694702, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.18993265181779861, "step": 2172 }, { "epoch": 0.03244760674635468, "grad_norm": 0.4765625, "grad_norm_var": 0.004262733459472656, "learning_rate": 0.0001, "loss": 1.7641, "loss/crossentropy": 2.662707209587097, "loss/fcd": 1.53125, "loss/idx": 8.0, "loss/logits": 0.23289669305086136, "step": 2173 }, { "epoch": 0.032462538916969666, "grad_norm": 0.439453125, "grad_norm_var": 0.001624155044555664, "learning_rate": 0.0001, "loss": 1.6811, "loss/crossentropy": 2.5959266424179077, "loss/fcd": 1.4609375, "loss/idx": 8.0, "loss/logits": 0.22017040103673935, "step": 2174 }, { "epoch": 0.03247747108758465, "grad_norm": 0.4296875, "grad_norm_var": 0.001587677001953125, "learning_rate": 0.0001, "loss": 1.6884, "loss/crossentropy": 2.4609930515289307, "loss/fcd": 1.47265625, "loss/idx": 8.0, "loss/logits": 0.21571926772594452, "step": 2175 }, { "epoch": 0.03249240325819963, "grad_norm": 0.396484375, "grad_norm_var": 0.0014231204986572266, "learning_rate": 0.0001, "loss": 1.5508, "loss/crossentropy": 2.5765715837478638, "loss/fcd": 1.36328125, "loss/idx": 8.0, "loss/logits": 0.18747826665639877, "step": 2176 }, { "epoch": 0.03250733542881461, "grad_norm": 0.357421875, "grad_norm_var": 0.0015689690907796225, "learning_rate": 0.0001, "loss": 1.5042, "loss/crossentropy": 2.52343213558197, "loss/fcd": 1.3359375, "loss/idx": 8.0, "loss/logits": 0.1682533249258995, "step": 2177 }, { "epoch": 0.03252226759942959, "grad_norm": 0.51953125, "grad_norm_var": 0.0020006656646728515, "learning_rate": 0.0001, "loss": 1.958, "loss/crossentropy": 2.307738184928894, "loss/fcd": 1.70703125, "loss/idx": 8.0, "loss/logits": 0.25100916624069214, "step": 2178 }, { "epoch": 0.03253719977004457, "grad_norm": 0.43359375, "grad_norm_var": 0.0019968032836914064, "learning_rate": 0.0001, "loss": 1.6636, "loss/crossentropy": 2.4749315977096558, "loss/fcd": 1.44921875, "loss/idx": 8.0, "loss/logits": 0.21441347897052765, "step": 2179 }, { "epoch": 0.032552131940659555, "grad_norm": 0.458984375, "grad_norm_var": 0.0016483147939046225, "learning_rate": 0.0001, "loss": 1.7557, "loss/crossentropy": 2.4802643060684204, "loss/fcd": 1.51171875, "loss/idx": 8.0, "loss/logits": 0.24397443234920502, "step": 2180 }, { "epoch": 0.03256706411127454, "grad_norm": 0.55078125, "grad_norm_var": 0.0025716145833333333, "learning_rate": 0.0001, "loss": 2.0214, "loss/crossentropy": 2.6713634729385376, "loss/fcd": 1.74609375, "loss/idx": 8.0, "loss/logits": 0.27528825402259827, "step": 2181 }, { "epoch": 0.032581996281889514, "grad_norm": 0.40234375, "grad_norm_var": 0.0026254653930664062, "learning_rate": 0.0001, "loss": 1.5096, "loss/crossentropy": 2.5994588136672974, "loss/fcd": 1.33984375, "loss/idx": 8.0, "loss/logits": 0.16977737843990326, "step": 2182 }, { "epoch": 0.032596928452504496, "grad_norm": 0.478515625, "grad_norm_var": 0.0027322769165039062, "learning_rate": 0.0001, "loss": 1.6439, "loss/crossentropy": 2.9077088832855225, "loss/fcd": 1.4375, "loss/idx": 8.0, "loss/logits": 0.20641572773456573, "step": 2183 }, { "epoch": 0.03261186062311948, "grad_norm": 0.421875, "grad_norm_var": 0.0025357564290364584, "learning_rate": 0.0001, "loss": 1.6988, "loss/crossentropy": 2.7392778396606445, "loss/fcd": 1.48046875, "loss/idx": 8.0, "loss/logits": 0.21833113580942154, "step": 2184 }, { "epoch": 0.03262679279373446, "grad_norm": 0.453125, "grad_norm_var": 0.0024260838826497396, "learning_rate": 0.0001, "loss": 1.5856, "loss/crossentropy": 2.5244054794311523, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.18716184049844742, "step": 2185 }, { "epoch": 0.032641724964349444, "grad_norm": 0.412109375, "grad_norm_var": 0.0024393717447916668, "learning_rate": 0.0001, "loss": 1.4619, "loss/crossentropy": 2.6568782329559326, "loss/fcd": 1.29296875, "loss/idx": 8.0, "loss/logits": 0.16897783428430557, "step": 2186 }, { "epoch": 0.03265665713496443, "grad_norm": 0.484375, "grad_norm_var": 0.002484623591105143, "learning_rate": 0.0001, "loss": 1.7092, "loss/crossentropy": 2.5634366273880005, "loss/fcd": 1.5, "loss/idx": 8.0, "loss/logits": 0.20922350883483887, "step": 2187 }, { "epoch": 0.0326715893055794, "grad_norm": 0.4140625, "grad_norm_var": 0.002365557352701823, "learning_rate": 0.0001, "loss": 1.6341, "loss/crossentropy": 2.5642699003219604, "loss/fcd": 1.4375, "loss/idx": 8.0, "loss/logits": 0.19656551629304886, "step": 2188 }, { "epoch": 0.032686521476194386, "grad_norm": 0.44140625, "grad_norm_var": 0.0022974650065104166, "learning_rate": 0.0001, "loss": 1.746, "loss/crossentropy": 2.6615079641342163, "loss/fcd": 1.51171875, "loss/idx": 8.0, "loss/logits": 0.23429805785417557, "step": 2189 }, { "epoch": 0.03270145364680937, "grad_norm": 0.484375, "grad_norm_var": 0.00240019162495931, "learning_rate": 0.0001, "loss": 1.5947, "loss/crossentropy": 2.745781421661377, "loss/fcd": 1.40234375, "loss/idx": 8.0, "loss/logits": 0.19230937212705612, "step": 2190 }, { "epoch": 0.03271638581742435, "grad_norm": 0.458984375, "grad_norm_var": 0.0023894627888997396, "learning_rate": 0.0001, "loss": 1.7687, "loss/crossentropy": 2.8186583518981934, "loss/fcd": 1.53515625, "loss/idx": 8.0, "loss/logits": 0.23349624127149582, "step": 2191 }, { "epoch": 0.032731317988039334, "grad_norm": 0.40625, "grad_norm_var": 0.002328348159790039, "learning_rate": 0.0001, "loss": 1.5471, "loss/crossentropy": 2.8029152154922485, "loss/fcd": 1.35546875, "loss/idx": 8.0, "loss/logits": 0.19165025651454926, "step": 2192 }, { "epoch": 0.03274625015865431, "grad_norm": 0.443359375, "grad_norm_var": 0.0017450809478759765, "learning_rate": 0.0001, "loss": 1.6388, "loss/crossentropy": 2.6153008937835693, "loss/fcd": 1.43359375, "loss/idx": 8.0, "loss/logits": 0.20524615049362183, "step": 2193 }, { "epoch": 0.03276118232926929, "grad_norm": 0.4921875, "grad_norm_var": 0.0015528202056884766, "learning_rate": 0.0001, "loss": 1.5758, "loss/crossentropy": 2.551974296569824, "loss/fcd": 1.390625, "loss/idx": 8.0, "loss/logits": 0.1851801797747612, "step": 2194 }, { "epoch": 0.032776114499884275, "grad_norm": 0.412109375, "grad_norm_var": 0.0016351699829101562, "learning_rate": 0.0001, "loss": 1.6067, "loss/crossentropy": 2.6952152252197266, "loss/fcd": 1.41015625, "loss/idx": 8.0, "loss/logits": 0.19649723917245865, "step": 2195 }, { "epoch": 0.03279104667049926, "grad_norm": 0.41015625, "grad_norm_var": 0.001731729507446289, "learning_rate": 0.0001, "loss": 1.702, "loss/crossentropy": 2.7260611057281494, "loss/fcd": 1.48046875, "loss/idx": 8.0, "loss/logits": 0.2215757519006729, "step": 2196 }, { "epoch": 0.03280597884111424, "grad_norm": 0.40234375, "grad_norm_var": 0.0010721683502197266, "learning_rate": 0.0001, "loss": 1.6401, "loss/crossentropy": 2.559051752090454, "loss/fcd": 1.4375, "loss/idx": 8.0, "loss/logits": 0.2025933414697647, "step": 2197 }, { "epoch": 0.03282091101172922, "grad_norm": 0.455078125, "grad_norm_var": 0.000991058349609375, "learning_rate": 0.0001, "loss": 1.6651, "loss/crossentropy": 2.6858179569244385, "loss/fcd": 1.453125, "loss/idx": 8.0, "loss/logits": 0.21199822425842285, "step": 2198 }, { "epoch": 0.0328358431823442, "grad_norm": 0.494140625, "grad_norm_var": 0.001082611083984375, "learning_rate": 0.0001, "loss": 1.8886, "loss/crossentropy": 2.558376669883728, "loss/fcd": 1.64453125, "loss/idx": 8.0, "loss/logits": 0.2440936341881752, "step": 2199 }, { "epoch": 0.03285077535295918, "grad_norm": 0.376953125, "grad_norm_var": 0.0013344923655192057, "learning_rate": 0.0001, "loss": 1.5801, "loss/crossentropy": 2.511059880256653, "loss/fcd": 1.39453125, "loss/idx": 8.0, "loss/logits": 0.18552984297275543, "step": 2200 }, { "epoch": 0.032865707523574164, "grad_norm": 0.55078125, "grad_norm_var": 0.0021006107330322266, "learning_rate": 0.0001, "loss": 1.6234, "loss/crossentropy": 2.6318275928497314, "loss/fcd": 1.4296875, "loss/idx": 8.0, "loss/logits": 0.19372473657131195, "step": 2201 }, { "epoch": 0.03288063969418915, "grad_norm": 0.46875, "grad_norm_var": 0.002043914794921875, "learning_rate": 0.0001, "loss": 1.6236, "loss/crossentropy": 2.5065059661865234, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.20172550529241562, "step": 2202 }, { "epoch": 0.03289557186480413, "grad_norm": 0.388671875, "grad_norm_var": 0.0021739800771077475, "learning_rate": 0.0001, "loss": 1.4451, "loss/crossentropy": 2.653670907020569, "loss/fcd": 1.2890625, "loss/idx": 8.0, "loss/logits": 0.15603645145893097, "step": 2203 }, { "epoch": 0.032910504035419105, "grad_norm": 0.435546875, "grad_norm_var": 0.0021178563435872394, "learning_rate": 0.0001, "loss": 1.5979, "loss/crossentropy": 2.695380926132202, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.1994139701128006, "step": 2204 }, { "epoch": 0.03292543620603409, "grad_norm": 0.3984375, "grad_norm_var": 0.002254231770833333, "learning_rate": 0.0001, "loss": 1.4826, "loss/crossentropy": 2.7156689167022705, "loss/fcd": 1.3125, "loss/idx": 8.0, "loss/logits": 0.17005165666341782, "step": 2205 }, { "epoch": 0.03294036837664907, "grad_norm": 0.498046875, "grad_norm_var": 0.0023424625396728516, "learning_rate": 0.0001, "loss": 1.5419, "loss/crossentropy": 2.29854154586792, "loss/fcd": 1.37890625, "loss/idx": 8.0, "loss/logits": 0.16300424188375473, "step": 2206 }, { "epoch": 0.03295530054726405, "grad_norm": 0.43359375, "grad_norm_var": 0.002329444885253906, "learning_rate": 0.0001, "loss": 1.5765, "loss/crossentropy": 2.737528920173645, "loss/fcd": 1.3828125, "loss/idx": 8.0, "loss/logits": 0.1936636045575142, "step": 2207 }, { "epoch": 0.032970232717879036, "grad_norm": 0.412109375, "grad_norm_var": 0.002303934097290039, "learning_rate": 0.0001, "loss": 1.6804, "loss/crossentropy": 2.5732948780059814, "loss/fcd": 1.45703125, "loss/idx": 8.0, "loss/logits": 0.22341081500053406, "step": 2208 }, { "epoch": 0.03298516488849402, "grad_norm": 0.4140625, "grad_norm_var": 0.002352333068847656, "learning_rate": 0.0001, "loss": 1.6609, "loss/crossentropy": 2.764334201812744, "loss/fcd": 1.453125, "loss/idx": 8.0, "loss/logits": 0.20774365961551666, "step": 2209 }, { "epoch": 0.033000097059108995, "grad_norm": 0.5625, "grad_norm_var": 0.0031488418579101564, "learning_rate": 0.0001, "loss": 1.6789, "loss/crossentropy": 2.5513195991516113, "loss/fcd": 1.4765625, "loss/idx": 8.0, "loss/logits": 0.20235882699489594, "step": 2210 }, { "epoch": 0.03301502922972398, "grad_norm": 0.4921875, "grad_norm_var": 0.003202931086222331, "learning_rate": 0.0001, "loss": 1.7534, "loss/crossentropy": 2.7163147926330566, "loss/fcd": 1.53125, "loss/idx": 8.0, "loss/logits": 0.2221963107585907, "step": 2211 }, { "epoch": 0.03302996140033896, "grad_norm": 0.5, "grad_norm_var": 0.0032351016998291016, "learning_rate": 0.0001, "loss": 1.4728, "loss/crossentropy": 2.765359878540039, "loss/fcd": 1.3046875, "loss/idx": 8.0, "loss/logits": 0.16806495189666748, "step": 2212 }, { "epoch": 0.03304489357095394, "grad_norm": 0.62890625, "grad_norm_var": 0.004846556981404623, "learning_rate": 0.0001, "loss": 1.8766, "loss/crossentropy": 2.911900520324707, "loss/fcd": 1.64453125, "loss/idx": 8.0, "loss/logits": 0.23207394778728485, "step": 2213 }, { "epoch": 0.033059825741568925, "grad_norm": 0.43359375, "grad_norm_var": 0.0049163182576497395, "learning_rate": 0.0001, "loss": 1.6089, "loss/crossentropy": 2.5677011013031006, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.1908886954188347, "step": 2214 }, { "epoch": 0.03307475791218391, "grad_norm": 0.392578125, "grad_norm_var": 0.005207252502441406, "learning_rate": 0.0001, "loss": 1.6442, "loss/crossentropy": 2.369963765144348, "loss/fcd": 1.44921875, "loss/idx": 8.0, "loss/logits": 0.19494222104549408, "step": 2215 }, { "epoch": 0.033089690082798884, "grad_norm": 0.4296875, "grad_norm_var": 0.004785394668579102, "learning_rate": 0.0001, "loss": 1.5047, "loss/crossentropy": 2.5547975301742554, "loss/fcd": 1.3359375, "loss/idx": 8.0, "loss/logits": 0.1687789484858513, "step": 2216 }, { "epoch": 0.033104622253413866, "grad_norm": 0.373046875, "grad_norm_var": 0.004726092020670573, "learning_rate": 0.0001, "loss": 1.5298, "loss/crossentropy": 2.495131254196167, "loss/fcd": 1.34765625, "loss/idx": 8.0, "loss/logits": 0.18215063214302063, "step": 2217 }, { "epoch": 0.03311955442402885, "grad_norm": 0.404296875, "grad_norm_var": 0.004857746760050455, "learning_rate": 0.0001, "loss": 1.5646, "loss/crossentropy": 2.565692901611328, "loss/fcd": 1.3828125, "loss/idx": 8.0, "loss/logits": 0.18177250027656555, "step": 2218 }, { "epoch": 0.03313448659464383, "grad_norm": 0.5390625, "grad_norm_var": 0.005045000712076823, "learning_rate": 0.0001, "loss": 1.8853, "loss/crossentropy": 2.7909586429595947, "loss/fcd": 1.62109375, "loss/idx": 8.0, "loss/logits": 0.26420940458774567, "step": 2219 }, { "epoch": 0.033149418765258815, "grad_norm": 0.427734375, "grad_norm_var": 0.00507348378499349, "learning_rate": 0.0001, "loss": 1.5011, "loss/crossentropy": 2.6664270162582397, "loss/fcd": 1.33203125, "loss/idx": 8.0, "loss/logits": 0.16902238130569458, "step": 2220 }, { "epoch": 0.03316435093587379, "grad_norm": 0.400390625, "grad_norm_var": 0.005058018366495768, "learning_rate": 0.0001, "loss": 1.6133, "loss/crossentropy": 2.6542571783065796, "loss/fcd": 1.40625, "loss/idx": 8.0, "loss/logits": 0.2070382833480835, "step": 2221 }, { "epoch": 0.03317928310648877, "grad_norm": 0.443359375, "grad_norm_var": 0.004959217707316081, "learning_rate": 0.0001, "loss": 1.5783, "loss/crossentropy": 2.4304349422454834, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.1915796995162964, "step": 2222 }, { "epoch": 0.033194215277103756, "grad_norm": 0.423828125, "grad_norm_var": 0.004993629455566406, "learning_rate": 0.0001, "loss": 1.5138, "loss/crossentropy": 2.53697407245636, "loss/fcd": 1.33203125, "loss/idx": 8.0, "loss/logits": 0.18172089755535126, "step": 2223 }, { "epoch": 0.03320914744771874, "grad_norm": 0.390625, "grad_norm_var": 0.005144866307576498, "learning_rate": 0.0001, "loss": 1.7021, "loss/crossentropy": 2.5511213541030884, "loss/fcd": 1.4765625, "loss/idx": 8.0, "loss/logits": 0.2255162075161934, "step": 2224 }, { "epoch": 0.03322407961833372, "grad_norm": 0.38671875, "grad_norm_var": 0.0053353468577067055, "learning_rate": 0.0001, "loss": 1.486, "loss/crossentropy": 2.655084252357483, "loss/fcd": 1.30859375, "loss/idx": 8.0, "loss/logits": 0.1774442195892334, "step": 2225 }, { "epoch": 0.033239011788948704, "grad_norm": 0.474609375, "grad_norm_var": 0.0045206705729166664, "learning_rate": 0.0001, "loss": 1.7114, "loss/crossentropy": 2.538921356201172, "loss/fcd": 1.49609375, "loss/idx": 8.0, "loss/logits": 0.21534917503595352, "step": 2226 }, { "epoch": 0.03325394395956368, "grad_norm": 0.48046875, "grad_norm_var": 0.004457537333170573, "learning_rate": 0.0001, "loss": 1.6528, "loss/crossentropy": 2.420040726661682, "loss/fcd": 1.46875, "loss/idx": 8.0, "loss/logits": 0.18401682376861572, "step": 2227 }, { "epoch": 0.03326887613017866, "grad_norm": 0.408203125, "grad_norm_var": 0.004317839940388997, "learning_rate": 0.0001, "loss": 1.4697, "loss/crossentropy": 2.6357897520065308, "loss/fcd": 1.30859375, "loss/idx": 8.0, "loss/logits": 0.16109557449817657, "step": 2228 }, { "epoch": 0.033283808300793645, "grad_norm": 0.4140625, "grad_norm_var": 0.0017861525217692058, "learning_rate": 0.0001, "loss": 1.5827, "loss/crossentropy": 2.613793134689331, "loss/fcd": 1.39453125, "loss/idx": 8.0, "loss/logits": 0.1882176250219345, "step": 2229 }, { "epoch": 0.03329874047140863, "grad_norm": 0.474609375, "grad_norm_var": 0.001930681864420573, "learning_rate": 0.0001, "loss": 1.9712, "loss/crossentropy": 2.4130213260650635, "loss/fcd": 1.6875, "loss/idx": 8.0, "loss/logits": 0.283672571182251, "step": 2230 }, { "epoch": 0.03331367264202361, "grad_norm": 0.400390625, "grad_norm_var": 0.0018966039021809896, "learning_rate": 0.0001, "loss": 1.454, "loss/crossentropy": 2.5864596366882324, "loss/fcd": 1.28515625, "loss/idx": 8.0, "loss/logits": 0.16884412616491318, "step": 2231 }, { "epoch": 0.033328604812638586, "grad_norm": 0.546875, "grad_norm_var": 0.0027587254842122394, "learning_rate": 0.0001, "loss": 1.8451, "loss/crossentropy": 2.5193954706192017, "loss/fcd": 1.6171875, "loss/idx": 8.0, "loss/logits": 0.22792227566242218, "step": 2232 }, { "epoch": 0.03334353698325357, "grad_norm": 0.53125, "grad_norm_var": 0.0029788812001546225, "learning_rate": 0.0001, "loss": 1.6077, "loss/crossentropy": 2.707845449447632, "loss/fcd": 1.4140625, "loss/idx": 8.0, "loss/logits": 0.19365651905536652, "step": 2233 }, { "epoch": 0.03335846915386855, "grad_norm": 0.51171875, "grad_norm_var": 0.0030934015909830728, "learning_rate": 0.0001, "loss": 1.7563, "loss/crossentropy": 2.3636025190353394, "loss/fcd": 1.51171875, "loss/idx": 8.0, "loss/logits": 0.24460841715335846, "step": 2234 }, { "epoch": 0.033373401324483534, "grad_norm": 0.427734375, "grad_norm_var": 0.0025960127512613933, "learning_rate": 0.0001, "loss": 1.5775, "loss/crossentropy": 2.697512626647949, "loss/fcd": 1.3828125, "loss/idx": 8.0, "loss/logits": 0.19470475614070892, "step": 2235 }, { "epoch": 0.03338833349509852, "grad_norm": 0.431640625, "grad_norm_var": 0.0025872389475504556, "learning_rate": 0.0001, "loss": 1.6801, "loss/crossentropy": 2.5496588945388794, "loss/fcd": 1.45703125, "loss/idx": 8.0, "loss/logits": 0.22307701408863068, "step": 2236 }, { "epoch": 0.0334032656657135, "grad_norm": 0.47265625, "grad_norm_var": 0.002467854817708333, "learning_rate": 0.0001, "loss": 1.5215, "loss/crossentropy": 2.8857460021972656, "loss/fcd": 1.328125, "loss/idx": 8.0, "loss/logits": 0.193416990339756, "step": 2237 }, { "epoch": 0.033418197836328475, "grad_norm": 0.37890625, "grad_norm_var": 0.0027946313222249348, "learning_rate": 0.0001, "loss": 1.5452, "loss/crossentropy": 2.553863286972046, "loss/fcd": 1.3515625, "loss/idx": 8.0, "loss/logits": 0.19368629157543182, "step": 2238 }, { "epoch": 0.03343313000694346, "grad_norm": 0.51171875, "grad_norm_var": 0.0030042012532552082, "learning_rate": 0.0001, "loss": 1.6679, "loss/crossentropy": 2.9120118618011475, "loss/fcd": 1.46875, "loss/idx": 8.0, "loss/logits": 0.19919437915086746, "step": 2239 }, { "epoch": 0.03344806217755844, "grad_norm": 0.482421875, "grad_norm_var": 0.002771870295206706, "learning_rate": 0.0001, "loss": 1.5954, "loss/crossentropy": 2.5343586206436157, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.1970025897026062, "step": 2240 }, { "epoch": 0.033462994348173423, "grad_norm": 0.455078125, "grad_norm_var": 0.0024108250935872396, "learning_rate": 0.0001, "loss": 1.5693, "loss/crossentropy": 2.504353880882263, "loss/fcd": 1.37890625, "loss/idx": 8.0, "loss/logits": 0.1903868392109871, "step": 2241 }, { "epoch": 0.033477926518788406, "grad_norm": 0.373046875, "grad_norm_var": 0.002893511454264323, "learning_rate": 0.0001, "loss": 1.4125, "loss/crossentropy": 2.750340223312378, "loss/fcd": 1.25390625, "loss/idx": 8.0, "loss/logits": 0.15855064988136292, "step": 2242 }, { "epoch": 0.03349285868940338, "grad_norm": 0.47265625, "grad_norm_var": 0.0028721491495768228, "learning_rate": 0.0001, "loss": 1.6019, "loss/crossentropy": 2.5036251544952393, "loss/fcd": 1.4140625, "loss/idx": 8.0, "loss/logits": 0.1877887174487114, "step": 2243 }, { "epoch": 0.033507790860018365, "grad_norm": 0.474609375, "grad_norm_var": 0.0027262369791666665, "learning_rate": 0.0001, "loss": 1.4405, "loss/crossentropy": 2.2709195017814636, "loss/fcd": 1.29296875, "loss/idx": 8.0, "loss/logits": 0.14752307534217834, "step": 2244 }, { "epoch": 0.03352272303063335, "grad_norm": 0.4140625, "grad_norm_var": 0.0027262369791666665, "learning_rate": 0.0001, "loss": 1.5785, "loss/crossentropy": 2.586935520172119, "loss/fcd": 1.3828125, "loss/idx": 8.0, "loss/logits": 0.1956535428762436, "step": 2245 }, { "epoch": 0.03353765520124833, "grad_norm": 0.431640625, "grad_norm_var": 0.0027577082316080728, "learning_rate": 0.0001, "loss": 1.6211, "loss/crossentropy": 2.6019307374954224, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.1992647647857666, "step": 2246 }, { "epoch": 0.03355258737186331, "grad_norm": 0.435546875, "grad_norm_var": 0.0025683085123697916, "learning_rate": 0.0001, "loss": 1.5576, "loss/crossentropy": 2.639284610748291, "loss/fcd": 1.359375, "loss/idx": 8.0, "loss/logits": 0.19824903458356857, "step": 2247 }, { "epoch": 0.033567519542478295, "grad_norm": 0.435546875, "grad_norm_var": 0.0020455519358317056, "learning_rate": 0.0001, "loss": 1.5827, "loss/crossentropy": 2.701996326446533, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.19599143415689468, "step": 2248 }, { "epoch": 0.03358245171309327, "grad_norm": 0.451171875, "grad_norm_var": 0.001605669657389323, "learning_rate": 0.0001, "loss": 1.4919, "loss/crossentropy": 2.667503237724304, "loss/fcd": 1.3203125, "loss/idx": 8.0, "loss/logits": 0.17158003896474838, "step": 2249 }, { "epoch": 0.033597383883708254, "grad_norm": 0.380859375, "grad_norm_var": 0.001555617650349935, "learning_rate": 0.0001, "loss": 1.4574, "loss/crossentropy": 2.6365907192230225, "loss/fcd": 1.296875, "loss/idx": 8.0, "loss/logits": 0.16057138890028, "step": 2250 }, { "epoch": 0.03361231605432324, "grad_norm": 0.486328125, "grad_norm_var": 0.0016795953114827475, "learning_rate": 0.0001, "loss": 1.6782, "loss/crossentropy": 2.8209983110427856, "loss/fcd": 1.4609375, "loss/idx": 8.0, "loss/logits": 0.21728218346834183, "step": 2251 }, { "epoch": 0.03362724822493822, "grad_norm": 0.453125, "grad_norm_var": 0.0016759236653645833, "learning_rate": 0.0001, "loss": 1.8384, "loss/crossentropy": 2.4108022451400757, "loss/fcd": 1.5859375, "loss/idx": 8.0, "loss/logits": 0.25244712829589844, "step": 2252 }, { "epoch": 0.0336421803955532, "grad_norm": 0.5078125, "grad_norm_var": 0.0018859227498372396, "learning_rate": 0.0001, "loss": 1.8121, "loss/crossentropy": 2.7076724767684937, "loss/fcd": 1.5625, "loss/idx": 8.0, "loss/logits": 0.24964862316846848, "step": 2253 }, { "epoch": 0.03365711256616818, "grad_norm": 0.39453125, "grad_norm_var": 0.0017602920532226562, "learning_rate": 0.0001, "loss": 1.5103, "loss/crossentropy": 2.5698360204696655, "loss/fcd": 1.3359375, "loss/idx": 8.0, "loss/logits": 0.17437934130430222, "step": 2254 }, { "epoch": 0.03367204473678316, "grad_norm": 0.546875, "grad_norm_var": 0.002138519287109375, "learning_rate": 0.0001, "loss": 1.6216, "loss/crossentropy": 2.9092483520507812, "loss/fcd": 1.4140625, "loss/idx": 8.0, "loss/logits": 0.20751577615737915, "step": 2255 }, { "epoch": 0.03368697690739814, "grad_norm": 0.41796875, "grad_norm_var": 0.002117013931274414, "learning_rate": 0.0001, "loss": 1.7201, "loss/crossentropy": 2.6528059244155884, "loss/fcd": 1.48828125, "loss/idx": 8.0, "loss/logits": 0.23184439539909363, "step": 2256 }, { "epoch": 0.033701909078013126, "grad_norm": 0.427734375, "grad_norm_var": 0.0021294752756754556, "learning_rate": 0.0001, "loss": 1.7064, "loss/crossentropy": 2.6126315593719482, "loss/fcd": 1.48046875, "loss/idx": 8.0, "loss/logits": 0.22593572735786438, "step": 2257 }, { "epoch": 0.03371684124862811, "grad_norm": 0.443359375, "grad_norm_var": 0.0017735640207926433, "learning_rate": 0.0001, "loss": 1.8606, "loss/crossentropy": 2.4679126739501953, "loss/fcd": 1.59375, "loss/idx": 8.0, "loss/logits": 0.2668297737836838, "step": 2258 }, { "epoch": 0.03373177341924309, "grad_norm": 0.423828125, "grad_norm_var": 0.0017644246419270834, "learning_rate": 0.0001, "loss": 1.529, "loss/crossentropy": 2.6388542652130127, "loss/fcd": 1.34765625, "loss/idx": 8.0, "loss/logits": 0.18130510300397873, "step": 2259 }, { "epoch": 0.03374670558985807, "grad_norm": 0.53125, "grad_norm_var": 0.0021861871083577473, "learning_rate": 0.0001, "loss": 1.7246, "loss/crossentropy": 2.526273250579834, "loss/fcd": 1.51171875, "loss/idx": 8.0, "loss/logits": 0.21290218830108643, "step": 2260 }, { "epoch": 0.03376163776047305, "grad_norm": 0.423828125, "grad_norm_var": 0.0021468480428059895, "learning_rate": 0.0001, "loss": 1.5261, "loss/crossentropy": 2.478758692741394, "loss/fcd": 1.34765625, "loss/idx": 8.0, "loss/logits": 0.1784854233264923, "step": 2261 }, { "epoch": 0.03377656993108803, "grad_norm": 0.46484375, "grad_norm_var": 0.002136850357055664, "learning_rate": 0.0001, "loss": 1.6596, "loss/crossentropy": 2.616526246070862, "loss/fcd": 1.47265625, "loss/idx": 8.0, "loss/logits": 0.18692341446876526, "step": 2262 }, { "epoch": 0.033791502101703015, "grad_norm": 0.39453125, "grad_norm_var": 0.002329444885253906, "learning_rate": 0.0001, "loss": 1.4979, "loss/crossentropy": 2.7694281339645386, "loss/fcd": 1.3125, "loss/idx": 8.0, "loss/logits": 0.18539002537727356, "step": 2263 }, { "epoch": 0.033806434272318, "grad_norm": 0.447265625, "grad_norm_var": 0.002317047119140625, "learning_rate": 0.0001, "loss": 1.5627, "loss/crossentropy": 2.5573606491088867, "loss/fcd": 1.375, "loss/idx": 8.0, "loss/logits": 0.18772995471954346, "step": 2264 }, { "epoch": 0.033821366442932974, "grad_norm": 0.39453125, "grad_norm_var": 0.0025064945220947266, "learning_rate": 0.0001, "loss": 1.6374, "loss/crossentropy": 2.5073299407958984, "loss/fcd": 1.44140625, "loss/idx": 8.0, "loss/logits": 0.19599463790655136, "step": 2265 }, { "epoch": 0.033836298613547956, "grad_norm": 0.55078125, "grad_norm_var": 0.0028314590454101562, "learning_rate": 0.0001, "loss": 1.668, "loss/crossentropy": 2.656440854072571, "loss/fcd": 1.4609375, "loss/idx": 8.0, "loss/logits": 0.2070898786187172, "step": 2266 }, { "epoch": 0.03385123078416294, "grad_norm": 0.828125, "grad_norm_var": 0.01147929827372233, "learning_rate": 0.0001, "loss": 1.8747, "loss/crossentropy": 2.813192367553711, "loss/fcd": 1.63671875, "loss/idx": 8.0, "loss/logits": 0.2379578799009323, "step": 2267 }, { "epoch": 0.03386616295477792, "grad_norm": 0.474609375, "grad_norm_var": 0.01143646240234375, "learning_rate": 0.0001, "loss": 1.751, "loss/crossentropy": 2.49735951423645, "loss/fcd": 1.52734375, "loss/idx": 8.0, "loss/logits": 0.2236904576420784, "step": 2268 }, { "epoch": 0.033881095125392904, "grad_norm": 0.408203125, "grad_norm_var": 0.01168045997619629, "learning_rate": 0.0001, "loss": 1.5015, "loss/crossentropy": 2.5952394008636475, "loss/fcd": 1.328125, "loss/idx": 8.0, "loss/logits": 0.17333002388477325, "step": 2269 }, { "epoch": 0.03389602729600789, "grad_norm": 0.447265625, "grad_norm_var": 0.0113006591796875, "learning_rate": 0.0001, "loss": 1.6339, "loss/crossentropy": 2.499724507331848, "loss/fcd": 1.44140625, "loss/idx": 8.0, "loss/logits": 0.19244911521673203, "step": 2270 }, { "epoch": 0.03391095946662286, "grad_norm": 0.37890625, "grad_norm_var": 0.011489295959472656, "learning_rate": 0.0001, "loss": 1.5176, "loss/crossentropy": 2.735043168067932, "loss/fcd": 1.3359375, "loss/idx": 8.0, "loss/logits": 0.18167711049318314, "step": 2271 }, { "epoch": 0.033925891637237846, "grad_norm": 0.45703125, "grad_norm_var": 0.01133416493733724, "learning_rate": 0.0001, "loss": 1.7099, "loss/crossentropy": 2.6848727464675903, "loss/fcd": 1.49609375, "loss/idx": 8.0, "loss/logits": 0.21383347362279892, "step": 2272 }, { "epoch": 0.03394082380785283, "grad_norm": 0.408203125, "grad_norm_var": 0.011464182535807292, "learning_rate": 0.0001, "loss": 1.6116, "loss/crossentropy": 2.750237226486206, "loss/fcd": 1.40625, "loss/idx": 8.0, "loss/logits": 0.20530135929584503, "step": 2273 }, { "epoch": 0.03395575597846781, "grad_norm": 0.53125, "grad_norm_var": 0.011666599909464519, "learning_rate": 0.0001, "loss": 1.8075, "loss/crossentropy": 2.2781327962875366, "loss/fcd": 1.58984375, "loss/idx": 8.0, "loss/logits": 0.2176109403371811, "step": 2274 }, { "epoch": 0.033970688149082794, "grad_norm": 0.36328125, "grad_norm_var": 0.012290891011555989, "learning_rate": 0.0001, "loss": 1.5145, "loss/crossentropy": 2.466241717338562, "loss/fcd": 1.34375, "loss/idx": 8.0, "loss/logits": 0.17077409476041794, "step": 2275 }, { "epoch": 0.033985620319697776, "grad_norm": 0.39453125, "grad_norm_var": 0.012324269612630208, "learning_rate": 0.0001, "loss": 1.5018, "loss/crossentropy": 2.593820333480835, "loss/fcd": 1.31640625, "loss/idx": 8.0, "loss/logits": 0.1853567585349083, "step": 2276 }, { "epoch": 0.03400055249031275, "grad_norm": 0.427734375, "grad_norm_var": 0.012306149800618489, "learning_rate": 0.0001, "loss": 1.6389, "loss/crossentropy": 2.5633667707443237, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.2169882208108902, "step": 2277 }, { "epoch": 0.034015484660927735, "grad_norm": 0.396484375, "grad_norm_var": 0.012560383478800455, "learning_rate": 0.0001, "loss": 1.6327, "loss/crossentropy": 2.595790386199951, "loss/fcd": 1.4296875, "loss/idx": 8.0, "loss/logits": 0.20302347093820572, "step": 2278 }, { "epoch": 0.03403041683154272, "grad_norm": 0.41015625, "grad_norm_var": 0.012446705500284832, "learning_rate": 0.0001, "loss": 1.5457, "loss/crossentropy": 2.809088349342346, "loss/fcd": 1.36328125, "loss/idx": 8.0, "loss/logits": 0.1824195384979248, "step": 2279 }, { "epoch": 0.0340453490021577, "grad_norm": 0.42578125, "grad_norm_var": 0.01250457763671875, "learning_rate": 0.0001, "loss": 1.5244, "loss/crossentropy": 2.659807801246643, "loss/fcd": 1.33984375, "loss/idx": 8.0, "loss/logits": 0.18458574265241623, "step": 2280 }, { "epoch": 0.03406028117277268, "grad_norm": 0.470703125, "grad_norm_var": 0.012242364883422851, "learning_rate": 0.0001, "loss": 1.6732, "loss/crossentropy": 2.7676979303359985, "loss/fcd": 1.44921875, "loss/idx": 8.0, "loss/logits": 0.22395016252994537, "step": 2281 }, { "epoch": 0.03407521334338766, "grad_norm": 0.4453125, "grad_norm_var": 0.011672449111938477, "learning_rate": 0.0001, "loss": 1.6477, "loss/crossentropy": 2.490418791770935, "loss/fcd": 1.44921875, "loss/idx": 8.0, "loss/logits": 0.19852878898382187, "step": 2282 }, { "epoch": 0.03409014551400264, "grad_norm": 0.46875, "grad_norm_var": 0.0018282413482666015, "learning_rate": 0.0001, "loss": 1.6068, "loss/crossentropy": 2.5909727811813354, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.1887875497341156, "step": 2283 }, { "epoch": 0.034105077684617624, "grad_norm": 0.4921875, "grad_norm_var": 0.0019479751586914062, "learning_rate": 0.0001, "loss": 1.5888, "loss/crossentropy": 2.480063557624817, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.2020929679274559, "step": 2284 }, { "epoch": 0.03412000985523261, "grad_norm": 0.42578125, "grad_norm_var": 0.001909494400024414, "learning_rate": 0.0001, "loss": 1.6154, "loss/crossentropy": 2.698867440223694, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.2169634848833084, "step": 2285 }, { "epoch": 0.03413494202584759, "grad_norm": 0.478515625, "grad_norm_var": 0.0020259698232014973, "learning_rate": 0.0001, "loss": 1.67, "loss/crossentropy": 2.373765707015991, "loss/fcd": 1.4921875, "loss/idx": 8.0, "loss/logits": 0.17779844999313354, "step": 2286 }, { "epoch": 0.03414987419646257, "grad_norm": 0.435546875, "grad_norm_var": 0.00179595947265625, "learning_rate": 0.0001, "loss": 1.6296, "loss/crossentropy": 2.8165390491485596, "loss/fcd": 1.43359375, "loss/idx": 8.0, "loss/logits": 0.19600094854831696, "step": 2287 }, { "epoch": 0.03416480636707755, "grad_norm": 0.498046875, "grad_norm_var": 0.001997232437133789, "learning_rate": 0.0001, "loss": 1.6376, "loss/crossentropy": 2.5923389196395874, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.21967966854572296, "step": 2288 }, { "epoch": 0.03417973853769253, "grad_norm": 0.478515625, "grad_norm_var": 0.0019892215728759765, "learning_rate": 0.0001, "loss": 1.6049, "loss/crossentropy": 2.805776357650757, "loss/fcd": 1.41015625, "loss/idx": 8.0, "loss/logits": 0.1947225108742714, "step": 2289 }, { "epoch": 0.03419467070830751, "grad_norm": 0.40234375, "grad_norm_var": 0.0015696048736572265, "learning_rate": 0.0001, "loss": 1.7192, "loss/crossentropy": 2.3996899127960205, "loss/fcd": 1.5, "loss/idx": 8.0, "loss/logits": 0.21915637701749802, "step": 2290 }, { "epoch": 0.034209602878922496, "grad_norm": 0.484375, "grad_norm_var": 0.001273965835571289, "learning_rate": 0.0001, "loss": 1.8879, "loss/crossentropy": 2.5423721075057983, "loss/fcd": 1.61328125, "loss/idx": 8.0, "loss/logits": 0.27457770705223083, "step": 2291 }, { "epoch": 0.03422453504953748, "grad_norm": 0.49609375, "grad_norm_var": 0.0012227217356363931, "learning_rate": 0.0001, "loss": 1.7409, "loss/crossentropy": 2.3684213161468506, "loss/fcd": 1.48828125, "loss/idx": 8.0, "loss/logits": 0.252609059214592, "step": 2292 }, { "epoch": 0.034239467220152454, "grad_norm": 0.4453125, "grad_norm_var": 0.0011845270792643229, "learning_rate": 0.0001, "loss": 1.5719, "loss/crossentropy": 2.628530502319336, "loss/fcd": 1.3828125, "loss/idx": 8.0, "loss/logits": 0.18911800533533096, "step": 2293 }, { "epoch": 0.03425439939076744, "grad_norm": 0.435546875, "grad_norm_var": 0.0009836196899414063, "learning_rate": 0.0001, "loss": 1.566, "loss/crossentropy": 2.714632272720337, "loss/fcd": 1.3671875, "loss/idx": 8.0, "loss/logits": 0.19876396656036377, "step": 2294 }, { "epoch": 0.03426933156138242, "grad_norm": 0.427734375, "grad_norm_var": 0.0008959293365478516, "learning_rate": 0.0001, "loss": 1.477, "loss/crossentropy": 2.739201068878174, "loss/fcd": 1.3046875, "loss/idx": 8.0, "loss/logits": 0.17228715866804123, "step": 2295 }, { "epoch": 0.0342842637319974, "grad_norm": 0.4296875, "grad_norm_var": 0.0008806705474853516, "learning_rate": 0.0001, "loss": 1.6495, "loss/crossentropy": 2.7434141635894775, "loss/fcd": 1.453125, "loss/idx": 8.0, "loss/logits": 0.1963869035243988, "step": 2296 }, { "epoch": 0.034299195902612385, "grad_norm": 0.439453125, "grad_norm_var": 0.0008852481842041016, "learning_rate": 0.0001, "loss": 1.5608, "loss/crossentropy": 2.4648834466934204, "loss/fcd": 1.37890625, "loss/idx": 8.0, "loss/logits": 0.18185754120349884, "step": 2297 }, { "epoch": 0.03431412807322737, "grad_norm": 0.54296875, "grad_norm_var": 0.001352548599243164, "learning_rate": 0.0001, "loss": 1.6597, "loss/crossentropy": 2.55813729763031, "loss/fcd": 1.44921875, "loss/idx": 8.0, "loss/logits": 0.21047183126211166, "step": 2298 }, { "epoch": 0.034329060243842344, "grad_norm": 0.443359375, "grad_norm_var": 0.0013676325480143229, "learning_rate": 0.0001, "loss": 1.5382, "loss/crossentropy": 2.663989782333374, "loss/fcd": 1.359375, "loss/idx": 8.0, "loss/logits": 0.17883626371622086, "step": 2299 }, { "epoch": 0.034343992414457326, "grad_norm": 0.68359375, "grad_norm_var": 0.004486083984375, "learning_rate": 0.0001, "loss": 1.8591, "loss/crossentropy": 2.301128387451172, "loss/fcd": 1.59375, "loss/idx": 8.0, "loss/logits": 0.265314057469368, "step": 2300 }, { "epoch": 0.03435892458507231, "grad_norm": 0.4375, "grad_norm_var": 0.004422950744628906, "learning_rate": 0.0001, "loss": 1.6059, "loss/crossentropy": 2.5303125381469727, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.18399938941001892, "step": 2301 }, { "epoch": 0.03437385675568729, "grad_norm": 0.4140625, "grad_norm_var": 0.004630136489868164, "learning_rate": 0.0001, "loss": 1.5311, "loss/crossentropy": 2.707468628883362, "loss/fcd": 1.34765625, "loss/idx": 8.0, "loss/logits": 0.18344046920537949, "step": 2302 }, { "epoch": 0.034388788926302274, "grad_norm": 0.416015625, "grad_norm_var": 0.00473949114481608, "learning_rate": 0.0001, "loss": 1.7438, "loss/crossentropy": 2.5503976345062256, "loss/fcd": 1.515625, "loss/idx": 8.0, "loss/logits": 0.22820547223091125, "step": 2303 }, { "epoch": 0.03440372109691725, "grad_norm": 0.4453125, "grad_norm_var": 0.0046961466471354164, "learning_rate": 0.0001, "loss": 1.5364, "loss/crossentropy": 2.5478179454803467, "loss/fcd": 1.359375, "loss/idx": 8.0, "loss/logits": 0.17698953300714493, "step": 2304 }, { "epoch": 0.03441865326753223, "grad_norm": 0.419921875, "grad_norm_var": 0.004796282450358073, "learning_rate": 0.0001, "loss": 1.584, "loss/crossentropy": 2.569201350212097, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.1972959041595459, "step": 2305 }, { "epoch": 0.034433585438147216, "grad_norm": 1.4921875, "grad_norm_var": 0.07062327067057292, "learning_rate": 0.0001, "loss": 2.1849, "loss/crossentropy": 2.397889018058777, "loss/fcd": 1.94921875, "loss/idx": 8.0, "loss/logits": 0.23567065596580505, "step": 2306 }, { "epoch": 0.0344485176087622, "grad_norm": 0.478515625, "grad_norm_var": 0.07065974871317546, "learning_rate": 0.0001, "loss": 1.4995, "loss/crossentropy": 2.6797566413879395, "loss/fcd": 1.3359375, "loss/idx": 8.0, "loss/logits": 0.16356314718723297, "step": 2307 }, { "epoch": 0.03446344977937718, "grad_norm": 0.455078125, "grad_norm_var": 0.07093912760416667, "learning_rate": 0.0001, "loss": 1.5613, "loss/crossentropy": 2.575782895088196, "loss/fcd": 1.3671875, "loss/idx": 8.0, "loss/logits": 0.1941295713186264, "step": 2308 }, { "epoch": 0.034478381949992164, "grad_norm": 0.361328125, "grad_norm_var": 0.07227667172749837, "learning_rate": 0.0001, "loss": 1.4942, "loss/crossentropy": 2.499812126159668, "loss/fcd": 1.328125, "loss/idx": 8.0, "loss/logits": 0.16612045466899872, "step": 2309 }, { "epoch": 0.03449331412060714, "grad_norm": 0.392578125, "grad_norm_var": 0.07287672360738119, "learning_rate": 0.0001, "loss": 1.5401, "loss/crossentropy": 2.6230974197387695, "loss/fcd": 1.3515625, "loss/idx": 8.0, "loss/logits": 0.18852578103542328, "step": 2310 }, { "epoch": 0.03450824629122212, "grad_norm": 0.41796875, "grad_norm_var": 0.0729995091756185, "learning_rate": 0.0001, "loss": 1.5974, "loss/crossentropy": 2.520034670829773, "loss/fcd": 1.40234375, "loss/idx": 8.0, "loss/logits": 0.19502707570791245, "step": 2311 }, { "epoch": 0.034523178461837105, "grad_norm": 0.443359375, "grad_norm_var": 0.07285230954488119, "learning_rate": 0.0001, "loss": 1.5848, "loss/crossentropy": 2.6227974891662598, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.18640032410621643, "step": 2312 }, { "epoch": 0.03453811063245209, "grad_norm": 0.494140625, "grad_norm_var": 0.07246867815653484, "learning_rate": 0.0001, "loss": 1.7266, "loss/crossentropy": 3.077919602394104, "loss/fcd": 1.515625, "loss/idx": 8.0, "loss/logits": 0.21095766872167587, "step": 2313 }, { "epoch": 0.03455304280306707, "grad_norm": 0.474609375, "grad_norm_var": 0.07256158192952473, "learning_rate": 0.0001, "loss": 1.6405, "loss/crossentropy": 2.3098666667938232, "loss/fcd": 1.4609375, "loss/idx": 8.0, "loss/logits": 0.17955704033374786, "step": 2314 }, { "epoch": 0.034567974973682046, "grad_norm": 0.44921875, "grad_norm_var": 0.072506316502889, "learning_rate": 0.0001, "loss": 1.5883, "loss/crossentropy": 2.5592440366744995, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.1898176446557045, "step": 2315 }, { "epoch": 0.03458290714429703, "grad_norm": 0.392578125, "grad_norm_var": 0.07134348551432292, "learning_rate": 0.0001, "loss": 1.4525, "loss/crossentropy": 2.6738651990890503, "loss/fcd": 1.28515625, "loss/idx": 8.0, "loss/logits": 0.1673026829957962, "step": 2316 }, { "epoch": 0.03459783931491201, "grad_norm": 0.47265625, "grad_norm_var": 0.07113234202067058, "learning_rate": 0.0001, "loss": 1.5781, "loss/crossentropy": 2.358106255531311, "loss/fcd": 1.40625, "loss/idx": 8.0, "loss/logits": 0.17180654406547546, "step": 2317 }, { "epoch": 0.034612771485526994, "grad_norm": 0.458984375, "grad_norm_var": 0.07073642412821451, "learning_rate": 0.0001, "loss": 1.6144, "loss/crossentropy": 2.6107664108276367, "loss/fcd": 1.4140625, "loss/idx": 8.0, "loss/logits": 0.20029225945472717, "step": 2318 }, { "epoch": 0.03462770365614198, "grad_norm": 0.46875, "grad_norm_var": 0.07029139200846354, "learning_rate": 0.0001, "loss": 1.6747, "loss/crossentropy": 2.5824174880981445, "loss/fcd": 1.47265625, "loss/idx": 8.0, "loss/logits": 0.20199896395206451, "step": 2319 }, { "epoch": 0.03464263582675696, "grad_norm": 0.412109375, "grad_norm_var": 0.07063482602437338, "learning_rate": 0.0001, "loss": 1.7186, "loss/crossentropy": 2.5777279138565063, "loss/fcd": 1.5, "loss/idx": 8.0, "loss/logits": 0.21863004565238953, "step": 2320 }, { "epoch": 0.034657567997371935, "grad_norm": 0.423828125, "grad_norm_var": 0.07059133847554525, "learning_rate": 0.0001, "loss": 1.5442, "loss/crossentropy": 2.6421854496002197, "loss/fcd": 1.36328125, "loss/idx": 8.0, "loss/logits": 0.180875726044178, "step": 2321 }, { "epoch": 0.03467250016798692, "grad_norm": 0.443359375, "grad_norm_var": 0.0013608296712239584, "learning_rate": 0.0001, "loss": 1.7106, "loss/crossentropy": 2.530920624732971, "loss/fcd": 1.48828125, "loss/idx": 8.0, "loss/logits": 0.22235901653766632, "step": 2322 }, { "epoch": 0.0346874323386019, "grad_norm": 0.408203125, "grad_norm_var": 0.0013081868489583333, "learning_rate": 0.0001, "loss": 1.5736, "loss/crossentropy": 2.5497639179229736, "loss/fcd": 1.390625, "loss/idx": 8.0, "loss/logits": 0.18298085778951645, "step": 2323 }, { "epoch": 0.03470236450921688, "grad_norm": 0.427734375, "grad_norm_var": 0.0012837092081705729, "learning_rate": 0.0001, "loss": 1.5799, "loss/crossentropy": 2.3461703062057495, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.18147070705890656, "step": 2324 }, { "epoch": 0.034717296679831866, "grad_norm": 0.390625, "grad_norm_var": 0.0010541121164957682, "learning_rate": 0.0001, "loss": 1.5744, "loss/crossentropy": 2.6193888187408447, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.18765393644571304, "step": 2325 }, { "epoch": 0.03473222885044684, "grad_norm": 0.412109375, "grad_norm_var": 0.000965738296508789, "learning_rate": 0.0001, "loss": 1.6577, "loss/crossentropy": 2.5911881923675537, "loss/fcd": 1.4375, "loss/idx": 8.0, "loss/logits": 0.22018896788358688, "step": 2326 }, { "epoch": 0.034747161021061825, "grad_norm": 0.4296875, "grad_norm_var": 0.0009447574615478515, "learning_rate": 0.0001, "loss": 1.6854, "loss/crossentropy": 2.5786120891571045, "loss/fcd": 1.46875, "loss/idx": 8.0, "loss/logits": 0.21666912734508514, "step": 2327 }, { "epoch": 0.03476209319167681, "grad_norm": 0.4140625, "grad_norm_var": 0.0009759902954101562, "learning_rate": 0.0001, "loss": 1.7154, "loss/crossentropy": 2.5161091089248657, "loss/fcd": 1.50390625, "loss/idx": 8.0, "loss/logits": 0.21150072664022446, "step": 2328 }, { "epoch": 0.03477702536229179, "grad_norm": 0.57421875, "grad_norm_var": 0.0019997755686442056, "learning_rate": 0.0001, "loss": 1.7521, "loss/crossentropy": 2.4698917865753174, "loss/fcd": 1.54296875, "loss/idx": 8.0, "loss/logits": 0.20915883779525757, "step": 2329 }, { "epoch": 0.03479195753290677, "grad_norm": 0.412109375, "grad_norm_var": 0.001962137222290039, "learning_rate": 0.0001, "loss": 1.5744, "loss/crossentropy": 2.6292368173599243, "loss/fcd": 1.3828125, "loss/idx": 8.0, "loss/logits": 0.1916259080171585, "step": 2330 }, { "epoch": 0.034806889703521755, "grad_norm": 0.40625, "grad_norm_var": 0.0020068963368733723, "learning_rate": 0.0001, "loss": 1.4496, "loss/crossentropy": 2.6520153284072876, "loss/fcd": 1.28125, "loss/idx": 8.0, "loss/logits": 0.1683071404695511, "step": 2331 }, { "epoch": 0.03482182187413673, "grad_norm": 0.5390625, "grad_norm_var": 0.0025349934895833332, "learning_rate": 0.0001, "loss": 1.9506, "loss/crossentropy": 2.5273479223251343, "loss/fcd": 1.64453125, "loss/idx": 8.0, "loss/logits": 0.30611756443977356, "step": 2332 }, { "epoch": 0.034836754044751714, "grad_norm": 0.455078125, "grad_norm_var": 0.0024856408437093098, "learning_rate": 0.0001, "loss": 1.6355, "loss/crossentropy": 2.763409376144409, "loss/fcd": 1.4453125, "loss/idx": 8.0, "loss/logits": 0.19016236066818237, "step": 2333 }, { "epoch": 0.034851686215366696, "grad_norm": 0.4296875, "grad_norm_var": 0.0024739583333333332, "learning_rate": 0.0001, "loss": 1.642, "loss/crossentropy": 2.5200676918029785, "loss/fcd": 1.4375, "loss/idx": 8.0, "loss/logits": 0.2045264095067978, "step": 2334 }, { "epoch": 0.03486661838598168, "grad_norm": 0.64453125, "grad_norm_var": 0.0050689061482747395, "learning_rate": 0.0001, "loss": 2.0005, "loss/crossentropy": 2.693689703941345, "loss/fcd": 1.72265625, "loss/idx": 8.0, "loss/logits": 0.2778821438550949, "step": 2335 }, { "epoch": 0.03488155055659666, "grad_norm": 0.6640625, "grad_norm_var": 0.007715972264607748, "learning_rate": 0.0001, "loss": 1.9558, "loss/crossentropy": 2.3520920276641846, "loss/fcd": 1.70703125, "loss/idx": 8.0, "loss/logits": 0.2487807795405388, "step": 2336 }, { "epoch": 0.034896482727211645, "grad_norm": 0.46484375, "grad_norm_var": 0.007584126790364584, "learning_rate": 0.0001, "loss": 1.5584, "loss/crossentropy": 2.7768971920013428, "loss/fcd": 1.359375, "loss/idx": 8.0, "loss/logits": 0.1989838108420372, "step": 2337 }, { "epoch": 0.03491141489782662, "grad_norm": 0.37890625, "grad_norm_var": 0.00807035764058431, "learning_rate": 0.0001, "loss": 1.532, "loss/crossentropy": 2.609509229660034, "loss/fcd": 1.3515625, "loss/idx": 8.0, "loss/logits": 0.1804806962609291, "step": 2338 }, { "epoch": 0.0349263470684416, "grad_norm": 0.486328125, "grad_norm_var": 0.007852919896443685, "learning_rate": 0.0001, "loss": 1.6588, "loss/crossentropy": 2.635384678840637, "loss/fcd": 1.453125, "loss/idx": 8.0, "loss/logits": 0.2056696116924286, "step": 2339 }, { "epoch": 0.034941279239056586, "grad_norm": 0.435546875, "grad_norm_var": 0.007812102635701497, "learning_rate": 0.0001, "loss": 1.6009, "loss/crossentropy": 2.5654958486557007, "loss/fcd": 1.40234375, "loss/idx": 8.0, "loss/logits": 0.19853024184703827, "step": 2340 }, { "epoch": 0.03495621140967157, "grad_norm": 0.478515625, "grad_norm_var": 0.007352193196614583, "learning_rate": 0.0001, "loss": 1.6529, "loss/crossentropy": 2.6156376600265503, "loss/fcd": 1.4609375, "loss/idx": 8.0, "loss/logits": 0.1919594705104828, "step": 2341 }, { "epoch": 0.03497114358028655, "grad_norm": 0.4609375, "grad_norm_var": 0.00708158810933431, "learning_rate": 0.0001, "loss": 1.4927, "loss/crossentropy": 2.7483454942703247, "loss/fcd": 1.3125, "loss/idx": 8.0, "loss/logits": 0.1802477389574051, "step": 2342 }, { "epoch": 0.03498607575090153, "grad_norm": 0.443359375, "grad_norm_var": 0.00700225830078125, "learning_rate": 0.0001, "loss": 1.549, "loss/crossentropy": 2.6593244075775146, "loss/fcd": 1.36328125, "loss/idx": 8.0, "loss/logits": 0.18570350855588913, "step": 2343 }, { "epoch": 0.03500100792151651, "grad_norm": 0.45703125, "grad_norm_var": 0.0067372004191080725, "learning_rate": 0.0001, "loss": 1.5102, "loss/crossentropy": 2.356263756752014, "loss/fcd": 1.33203125, "loss/idx": 8.0, "loss/logits": 0.1781603991985321, "step": 2344 }, { "epoch": 0.03501594009213149, "grad_norm": 0.40625, "grad_norm_var": 0.006461079915364583, "learning_rate": 0.0001, "loss": 1.6341, "loss/crossentropy": 2.485535740852356, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.21221810579299927, "step": 2345 }, { "epoch": 0.035030872262746475, "grad_norm": 0.376953125, "grad_norm_var": 0.00682214101155599, "learning_rate": 0.0001, "loss": 1.42, "loss/crossentropy": 2.496162176132202, "loss/fcd": 1.2578125, "loss/idx": 8.0, "loss/logits": 0.1621968224644661, "step": 2346 }, { "epoch": 0.03504580443336146, "grad_norm": 0.40625, "grad_norm_var": 0.00682214101155599, "learning_rate": 0.0001, "loss": 1.588, "loss/crossentropy": 2.551849365234375, "loss/fcd": 1.39453125, "loss/idx": 8.0, "loss/logits": 0.1934635117650032, "step": 2347 }, { "epoch": 0.03506073660397644, "grad_norm": 0.515625, "grad_norm_var": 0.00664208730061849, "learning_rate": 0.0001, "loss": 1.6756, "loss/crossentropy": 2.8055641651153564, "loss/fcd": 1.4609375, "loss/idx": 8.0, "loss/logits": 0.21465323865413666, "step": 2348 }, { "epoch": 0.035075668774591416, "grad_norm": 0.498046875, "grad_norm_var": 0.006677754720052083, "learning_rate": 0.0001, "loss": 1.621, "loss/crossentropy": 2.658348560333252, "loss/fcd": 1.4296875, "loss/idx": 8.0, "loss/logits": 0.1913040578365326, "step": 2349 }, { "epoch": 0.0350906009452064, "grad_norm": 0.396484375, "grad_norm_var": 0.006932560602823893, "learning_rate": 0.0001, "loss": 1.5636, "loss/crossentropy": 2.5533119440078735, "loss/fcd": 1.37890625, "loss/idx": 8.0, "loss/logits": 0.18467216938734055, "step": 2350 }, { "epoch": 0.03510553311582138, "grad_norm": 0.3828125, "grad_norm_var": 0.005109389623006185, "learning_rate": 0.0001, "loss": 1.5188, "loss/crossentropy": 2.5236642360687256, "loss/fcd": 1.3359375, "loss/idx": 8.0, "loss/logits": 0.18285248428583145, "step": 2351 }, { "epoch": 0.035120465286436364, "grad_norm": 0.65625, "grad_norm_var": 0.004893604914347331, "learning_rate": 0.0001, "loss": 2.073, "loss/crossentropy": 2.929765462875366, "loss/fcd": 1.73828125, "loss/idx": 8.0, "loss/logits": 0.33474333584308624, "step": 2352 }, { "epoch": 0.03513539745705135, "grad_norm": 0.5078125, "grad_norm_var": 0.005078236262003581, "learning_rate": 0.0001, "loss": 1.5243, "loss/crossentropy": 2.7334107160568237, "loss/fcd": 1.36328125, "loss/idx": 8.0, "loss/logits": 0.16101469099521637, "step": 2353 }, { "epoch": 0.03515032962766632, "grad_norm": 0.48828125, "grad_norm_var": 0.0047097365061442055, "learning_rate": 0.0001, "loss": 1.6992, "loss/crossentropy": 2.674936294555664, "loss/fcd": 1.4921875, "loss/idx": 8.0, "loss/logits": 0.20697792619466782, "step": 2354 }, { "epoch": 0.035165261798281305, "grad_norm": 0.4296875, "grad_norm_var": 0.004728635152180989, "learning_rate": 0.0001, "loss": 1.4685, "loss/crossentropy": 2.4868372678756714, "loss/fcd": 1.30078125, "loss/idx": 8.0, "loss/logits": 0.167751744389534, "step": 2355 }, { "epoch": 0.03518019396889629, "grad_norm": 0.376953125, "grad_norm_var": 0.005124409993489583, "learning_rate": 0.0001, "loss": 1.4645, "loss/crossentropy": 2.8574626445770264, "loss/fcd": 1.3046875, "loss/idx": 8.0, "loss/logits": 0.1597769409418106, "step": 2356 }, { "epoch": 0.03519512613951127, "grad_norm": 0.462890625, "grad_norm_var": 0.005090840657552083, "learning_rate": 0.0001, "loss": 1.5413, "loss/crossentropy": 2.950088620185852, "loss/fcd": 1.34765625, "loss/idx": 8.0, "loss/logits": 0.19366306066513062, "step": 2357 }, { "epoch": 0.035210058310126253, "grad_norm": 0.45703125, "grad_norm_var": 0.005088233947753906, "learning_rate": 0.0001, "loss": 2.0543, "loss/crossentropy": 2.7631583213806152, "loss/fcd": 1.765625, "loss/idx": 8.0, "loss/logits": 0.2886839061975479, "step": 2358 }, { "epoch": 0.035224990480741236, "grad_norm": 0.4765625, "grad_norm_var": 0.0051106611887613935, "learning_rate": 0.0001, "loss": 1.6189, "loss/crossentropy": 2.583518624305725, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.1970374658703804, "step": 2359 }, { "epoch": 0.03523992265135621, "grad_norm": 0.41015625, "grad_norm_var": 0.005241123835245768, "learning_rate": 0.0001, "loss": 1.4885, "loss/crossentropy": 2.583472728729248, "loss/fcd": 1.32421875, "loss/idx": 8.0, "loss/logits": 0.16427521407604218, "step": 2360 }, { "epoch": 0.035254854821971195, "grad_norm": 0.51171875, "grad_norm_var": 0.005278889338175456, "learning_rate": 0.0001, "loss": 1.6178, "loss/crossentropy": 2.7838470935821533, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.1958780512213707, "step": 2361 }, { "epoch": 0.03526978699258618, "grad_norm": 0.431640625, "grad_norm_var": 0.004863214492797851, "learning_rate": 0.0001, "loss": 1.6385, "loss/crossentropy": 2.558327078819275, "loss/fcd": 1.4453125, "loss/idx": 8.0, "loss/logits": 0.19317439198493958, "step": 2362 }, { "epoch": 0.03528471916320116, "grad_norm": 0.375, "grad_norm_var": 0.005160760879516601, "learning_rate": 0.0001, "loss": 1.555, "loss/crossentropy": 2.6038241386413574, "loss/fcd": 1.37109375, "loss/idx": 8.0, "loss/logits": 0.1839405745267868, "step": 2363 }, { "epoch": 0.03529965133381614, "grad_norm": 0.51953125, "grad_norm_var": 0.005190134048461914, "learning_rate": 0.0001, "loss": 1.8041, "loss/crossentropy": 2.4497241973876953, "loss/fcd": 1.57421875, "loss/idx": 8.0, "loss/logits": 0.2299092933535576, "step": 2364 }, { "epoch": 0.03531458350443112, "grad_norm": 0.4453125, "grad_norm_var": 0.0051055908203125, "learning_rate": 0.0001, "loss": 1.5624, "loss/crossentropy": 2.654478073120117, "loss/fcd": 1.37109375, "loss/idx": 8.0, "loss/logits": 0.19133185595273972, "step": 2365 }, { "epoch": 0.0353295156750461, "grad_norm": 0.400390625, "grad_norm_var": 0.005074501037597656, "learning_rate": 0.0001, "loss": 1.5244, "loss/crossentropy": 2.5421375036239624, "loss/fcd": 1.33203125, "loss/idx": 8.0, "loss/logits": 0.19237712770700455, "step": 2366 }, { "epoch": 0.035344447845661084, "grad_norm": 0.48046875, "grad_norm_var": 0.004688262939453125, "learning_rate": 0.0001, "loss": 1.7109, "loss/crossentropy": 2.477445602416992, "loss/fcd": 1.49609375, "loss/idx": 8.0, "loss/logits": 0.21484993398189545, "step": 2367 }, { "epoch": 0.03535938001627607, "grad_norm": 0.5, "grad_norm_var": 0.002216339111328125, "learning_rate": 0.0001, "loss": 1.6741, "loss/crossentropy": 2.310474991798401, "loss/fcd": 1.47265625, "loss/idx": 8.0, "loss/logits": 0.20147616416215897, "step": 2368 }, { "epoch": 0.03537431218689105, "grad_norm": 0.458984375, "grad_norm_var": 0.0020188490549723306, "learning_rate": 0.0001, "loss": 1.5195, "loss/crossentropy": 2.7193963527679443, "loss/fcd": 1.33984375, "loss/idx": 8.0, "loss/logits": 0.17962011694908142, "step": 2369 }, { "epoch": 0.03538924435750603, "grad_norm": 0.37890625, "grad_norm_var": 0.0022306919097900392, "learning_rate": 0.0001, "loss": 1.6057, "loss/crossentropy": 2.2751742601394653, "loss/fcd": 1.40234375, "loss/idx": 8.0, "loss/logits": 0.20340051501989365, "step": 2370 }, { "epoch": 0.03540417652812101, "grad_norm": 0.5078125, "grad_norm_var": 0.0024557590484619142, "learning_rate": 0.0001, "loss": 1.7258, "loss/crossentropy": 2.469887852668762, "loss/fcd": 1.5, "loss/idx": 8.0, "loss/logits": 0.22584877908229828, "step": 2371 }, { "epoch": 0.03541910869873599, "grad_norm": 0.392578125, "grad_norm_var": 0.0023197015126546223, "learning_rate": 0.0001, "loss": 1.5231, "loss/crossentropy": 2.624801278114319, "loss/fcd": 1.33984375, "loss/idx": 8.0, "loss/logits": 0.18326006829738617, "step": 2372 }, { "epoch": 0.03543404086935097, "grad_norm": 0.458984375, "grad_norm_var": 0.0023142337799072266, "learning_rate": 0.0001, "loss": 1.6821, "loss/crossentropy": 2.3898041248321533, "loss/fcd": 1.48046875, "loss/idx": 8.0, "loss/logits": 0.2016235962510109, "step": 2373 }, { "epoch": 0.035448973039965956, "grad_norm": 0.466796875, "grad_norm_var": 0.002328936258951823, "learning_rate": 0.0001, "loss": 1.5886, "loss/crossentropy": 2.716990351676941, "loss/fcd": 1.390625, "loss/idx": 8.0, "loss/logits": 0.19794243574142456, "step": 2374 }, { "epoch": 0.03546390521058094, "grad_norm": 0.40234375, "grad_norm_var": 0.0024195353190104166, "learning_rate": 0.0001, "loss": 1.646, "loss/crossentropy": 2.563752770423889, "loss/fcd": 1.42578125, "loss/idx": 8.0, "loss/logits": 0.22020038962364197, "step": 2375 }, { "epoch": 0.035478837381195914, "grad_norm": 0.59765625, "grad_norm_var": 0.0037134806315104168, "learning_rate": 0.0001, "loss": 1.7578, "loss/crossentropy": 2.4816360473632812, "loss/fcd": 1.52734375, "loss/idx": 8.0, "loss/logits": 0.23049044609069824, "step": 2376 }, { "epoch": 0.0354937695518109, "grad_norm": 0.50390625, "grad_norm_var": 0.003661346435546875, "learning_rate": 0.0001, "loss": 1.7231, "loss/crossentropy": 2.76643705368042, "loss/fcd": 1.5078125, "loss/idx": 8.0, "loss/logits": 0.21525081247091293, "step": 2377 }, { "epoch": 0.03550870172242588, "grad_norm": 0.36328125, "grad_norm_var": 0.004189284642537435, "learning_rate": 0.0001, "loss": 1.4858, "loss/crossentropy": 2.639698624610901, "loss/fcd": 1.3125, "loss/idx": 8.0, "loss/logits": 0.17330139130353928, "step": 2378 }, { "epoch": 0.03552363389304086, "grad_norm": 0.419921875, "grad_norm_var": 0.00384674072265625, "learning_rate": 0.0001, "loss": 1.6388, "loss/crossentropy": 2.573868989944458, "loss/fcd": 1.42578125, "loss/idx": 8.0, "loss/logits": 0.21299321204423904, "step": 2379 }, { "epoch": 0.035538566063655845, "grad_norm": 0.455078125, "grad_norm_var": 0.0035608768463134765, "learning_rate": 0.0001, "loss": 1.6242, "loss/crossentropy": 2.652523398399353, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.20627965033054352, "step": 2380 }, { "epoch": 0.03555349823427083, "grad_norm": 0.396484375, "grad_norm_var": 0.0037535985310872396, "learning_rate": 0.0001, "loss": 1.5107, "loss/crossentropy": 2.7874321937561035, "loss/fcd": 1.32421875, "loss/idx": 8.0, "loss/logits": 0.18643464893102646, "step": 2381 }, { "epoch": 0.035568430404885804, "grad_norm": 0.435546875, "grad_norm_var": 0.0036031087239583332, "learning_rate": 0.0001, "loss": 1.7062, "loss/crossentropy": 2.648239493370056, "loss/fcd": 1.46875, "loss/idx": 8.0, "loss/logits": 0.2374965399503708, "step": 2382 }, { "epoch": 0.035583362575500786, "grad_norm": 0.4375, "grad_norm_var": 0.0035506566365559895, "learning_rate": 0.0001, "loss": 1.6251, "loss/crossentropy": 2.4535878896713257, "loss/fcd": 1.4375, "loss/idx": 8.0, "loss/logits": 0.18764664977788925, "step": 2383 }, { "epoch": 0.03559829474611577, "grad_norm": 0.40234375, "grad_norm_var": 0.0034759521484375, "learning_rate": 0.0001, "loss": 1.578, "loss/crossentropy": 2.4919214248657227, "loss/fcd": 1.39453125, "loss/idx": 8.0, "loss/logits": 0.18350006639957428, "step": 2384 }, { "epoch": 0.03561322691673075, "grad_norm": 0.400390625, "grad_norm_var": 0.0035608291625976564, "learning_rate": 0.0001, "loss": 1.6567, "loss/crossentropy": 2.6839237213134766, "loss/fcd": 1.43359375, "loss/idx": 8.0, "loss/logits": 0.223098523914814, "step": 2385 }, { "epoch": 0.035628159087345734, "grad_norm": 0.388671875, "grad_norm_var": 0.0034889062245686848, "learning_rate": 0.0001, "loss": 1.57, "loss/crossentropy": 2.7242919206619263, "loss/fcd": 1.3671875, "loss/idx": 8.0, "loss/logits": 0.20277619361877441, "step": 2386 }, { "epoch": 0.03564309125796071, "grad_norm": 0.41796875, "grad_norm_var": 0.003173049290974935, "learning_rate": 0.0001, "loss": 1.4591, "loss/crossentropy": 2.5968987941741943, "loss/fcd": 1.2890625, "loss/idx": 8.0, "loss/logits": 0.1700122058391571, "step": 2387 }, { "epoch": 0.03565802342857569, "grad_norm": 0.38671875, "grad_norm_var": 0.0032073338826497395, "learning_rate": 0.0001, "loss": 1.5747, "loss/crossentropy": 2.6420031785964966, "loss/fcd": 1.375, "loss/idx": 8.0, "loss/logits": 0.19966182857751846, "step": 2388 }, { "epoch": 0.035672955599190675, "grad_norm": 0.3671875, "grad_norm_var": 0.0034202416737874348, "learning_rate": 0.0001, "loss": 1.5188, "loss/crossentropy": 2.440239429473877, "loss/fcd": 1.34375, "loss/idx": 8.0, "loss/logits": 0.1750549152493477, "step": 2389 }, { "epoch": 0.03568788776980566, "grad_norm": 0.6015625, "grad_norm_var": 0.005259450276692708, "learning_rate": 0.0001, "loss": 1.6231, "loss/crossentropy": 2.6219881772994995, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.20123480260372162, "step": 2390 }, { "epoch": 0.03570281994042064, "grad_norm": 0.40625, "grad_norm_var": 0.00524285634358724, "learning_rate": 0.0001, "loss": 1.5072, "loss/crossentropy": 2.721365213394165, "loss/fcd": 1.32421875, "loss/idx": 8.0, "loss/logits": 0.1829833984375, "step": 2391 }, { "epoch": 0.035717752111035624, "grad_norm": 0.380859375, "grad_norm_var": 0.0035156091054280598, "learning_rate": 0.0001, "loss": 1.5604, "loss/crossentropy": 2.7448049783706665, "loss/fcd": 1.37109375, "loss/idx": 8.0, "loss/logits": 0.1893468052148819, "step": 2392 }, { "epoch": 0.0357326842816506, "grad_norm": 0.412109375, "grad_norm_var": 0.0030487060546875, "learning_rate": 0.0001, "loss": 1.6143, "loss/crossentropy": 2.603681802749634, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.19631332159042358, "step": 2393 }, { "epoch": 0.03574761645226558, "grad_norm": 0.37890625, "grad_norm_var": 0.0029520670572916667, "learning_rate": 0.0001, "loss": 1.5122, "loss/crossentropy": 2.29004442691803, "loss/fcd": 1.34765625, "loss/idx": 8.0, "loss/logits": 0.1645762249827385, "step": 2394 }, { "epoch": 0.035762548622880565, "grad_norm": 0.455078125, "grad_norm_var": 0.003038469950358073, "learning_rate": 0.0001, "loss": 1.6274, "loss/crossentropy": 2.7295596599578857, "loss/fcd": 1.43359375, "loss/idx": 8.0, "loss/logits": 0.19382072985172272, "step": 2395 }, { "epoch": 0.03577748079349555, "grad_norm": 0.45703125, "grad_norm_var": 0.003047800064086914, "learning_rate": 0.0001, "loss": 1.6292, "loss/crossentropy": 2.6545112133026123, "loss/fcd": 1.42578125, "loss/idx": 8.0, "loss/logits": 0.20339828729629517, "step": 2396 }, { "epoch": 0.03579241296411053, "grad_norm": 0.46484375, "grad_norm_var": 0.003122901916503906, "learning_rate": 0.0001, "loss": 1.6349, "loss/crossentropy": 2.7580233812332153, "loss/fcd": 1.44140625, "loss/idx": 8.0, "loss/logits": 0.1935306042432785, "step": 2397 }, { "epoch": 0.03580734513472551, "grad_norm": 0.375, "grad_norm_var": 0.0032633304595947265, "learning_rate": 0.0001, "loss": 1.4834, "loss/crossentropy": 2.6720550060272217, "loss/fcd": 1.31640625, "loss/idx": 8.0, "loss/logits": 0.16701490432024002, "step": 2398 }, { "epoch": 0.03582227730534049, "grad_norm": 0.76171875, "grad_norm_var": 0.01055614153544108, "learning_rate": 0.0001, "loss": 1.7776, "loss/crossentropy": 3.1363632678985596, "loss/fcd": 1.60546875, "loss/idx": 8.0, "loss/logits": 0.17216653376817703, "step": 2399 }, { "epoch": 0.03583720947595547, "grad_norm": 0.70703125, "grad_norm_var": 0.014786259333292643, "learning_rate": 0.0001, "loss": 1.9228, "loss/crossentropy": 2.8044469356536865, "loss/fcd": 1.65625, "loss/idx": 8.0, "loss/logits": 0.2665247991681099, "step": 2400 }, { "epoch": 0.035852141646570454, "grad_norm": 0.458984375, "grad_norm_var": 0.014534489313761393, "learning_rate": 0.0001, "loss": 1.5444, "loss/crossentropy": 2.6123517751693726, "loss/fcd": 1.36328125, "loss/idx": 8.0, "loss/logits": 0.18109259009361267, "step": 2401 }, { "epoch": 0.03586707381718544, "grad_norm": 0.41015625, "grad_norm_var": 0.014348284403483073, "learning_rate": 0.0001, "loss": 1.4385, "loss/crossentropy": 2.6397366523742676, "loss/fcd": 1.2734375, "loss/idx": 8.0, "loss/logits": 0.16508978605270386, "step": 2402 }, { "epoch": 0.03588200598780042, "grad_norm": 0.462890625, "grad_norm_var": 0.01419218381245931, "learning_rate": 0.0001, "loss": 1.5746, "loss/crossentropy": 2.7463337182998657, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.18785084784030914, "step": 2403 }, { "epoch": 0.035896938158415395, "grad_norm": 0.51171875, "grad_norm_var": 0.013815800348917643, "learning_rate": 0.0001, "loss": 1.5566, "loss/crossentropy": 2.500896692276001, "loss/fcd": 1.3828125, "loss/idx": 8.0, "loss/logits": 0.17382127791643143, "step": 2404 }, { "epoch": 0.03591187032903038, "grad_norm": 0.42578125, "grad_norm_var": 0.013182560602823893, "learning_rate": 0.0001, "loss": 1.5376, "loss/crossentropy": 2.501421809196472, "loss/fcd": 1.3515625, "loss/idx": 8.0, "loss/logits": 0.18601687252521515, "step": 2405 }, { "epoch": 0.03592680249964536, "grad_norm": 0.46875, "grad_norm_var": 0.012121184666951498, "learning_rate": 0.0001, "loss": 1.6274, "loss/crossentropy": 2.4910330772399902, "loss/fcd": 1.4375, "loss/idx": 8.0, "loss/logits": 0.18988292664289474, "step": 2406 }, { "epoch": 0.03594173467026034, "grad_norm": 0.369140625, "grad_norm_var": 0.012527974446614583, "learning_rate": 0.0001, "loss": 1.4432, "loss/crossentropy": 2.663501262664795, "loss/fcd": 1.27734375, "loss/idx": 8.0, "loss/logits": 0.1658233478665352, "step": 2407 }, { "epoch": 0.035956666840875326, "grad_norm": 0.384765625, "grad_norm_var": 0.012483151753743489, "learning_rate": 0.0001, "loss": 1.5869, "loss/crossentropy": 2.6975330114364624, "loss/fcd": 1.3828125, "loss/idx": 8.0, "loss/logits": 0.2040604203939438, "step": 2408 }, { "epoch": 0.03597159901149031, "grad_norm": 0.5, "grad_norm_var": 0.012299331029256184, "learning_rate": 0.0001, "loss": 1.6173, "loss/crossentropy": 2.578181266784668, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.19546962529420853, "step": 2409 }, { "epoch": 0.035986531182105284, "grad_norm": 0.4140625, "grad_norm_var": 0.011928542455037435, "learning_rate": 0.0001, "loss": 1.6768, "loss/crossentropy": 2.545762300491333, "loss/fcd": 1.46875, "loss/idx": 8.0, "loss/logits": 0.2080402597784996, "step": 2410 }, { "epoch": 0.03600146335272027, "grad_norm": 0.388671875, "grad_norm_var": 0.012395461400349935, "learning_rate": 0.0001, "loss": 1.5352, "loss/crossentropy": 2.4450128078460693, "loss/fcd": 1.35546875, "loss/idx": 8.0, "loss/logits": 0.17971044033765793, "step": 2411 }, { "epoch": 0.03601639552333525, "grad_norm": 0.416015625, "grad_norm_var": 0.01258538564046224, "learning_rate": 0.0001, "loss": 1.5208, "loss/crossentropy": 2.646558165550232, "loss/fcd": 1.33984375, "loss/idx": 8.0, "loss/logits": 0.1809879019856453, "step": 2412 }, { "epoch": 0.03603132769395023, "grad_norm": 0.45703125, "grad_norm_var": 0.01259454091389974, "learning_rate": 0.0001, "loss": 1.5546, "loss/crossentropy": 2.762848734855652, "loss/fcd": 1.359375, "loss/idx": 8.0, "loss/logits": 0.19518350064754486, "step": 2413 }, { "epoch": 0.036046259864565215, "grad_norm": 0.5078125, "grad_norm_var": 0.012023862202962239, "learning_rate": 0.0001, "loss": 1.7267, "loss/crossentropy": 2.5716902017593384, "loss/fcd": 1.50390625, "loss/idx": 8.0, "loss/logits": 0.22281979024410248, "step": 2414 }, { "epoch": 0.03606119203518019, "grad_norm": 0.419921875, "grad_norm_var": 0.006385660171508789, "learning_rate": 0.0001, "loss": 1.4823, "loss/crossentropy": 2.5244510173797607, "loss/fcd": 1.3046875, "loss/idx": 8.0, "loss/logits": 0.17756371200084686, "step": 2415 }, { "epoch": 0.036076124205795174, "grad_norm": 0.4921875, "grad_norm_var": 0.0020915826161702475, "learning_rate": 0.0001, "loss": 1.7898, "loss/crossentropy": 2.7621735334396362, "loss/fcd": 1.546875, "loss/idx": 8.0, "loss/logits": 0.24293024837970734, "step": 2416 }, { "epoch": 0.036091056376410156, "grad_norm": 0.42578125, "grad_norm_var": 0.002089691162109375, "learning_rate": 0.0001, "loss": 1.6378, "loss/crossentropy": 2.53537380695343, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.21593287587165833, "step": 2417 }, { "epoch": 0.03610598854702514, "grad_norm": 0.375, "grad_norm_var": 0.0023111343383789063, "learning_rate": 0.0001, "loss": 1.6042, "loss/crossentropy": 2.58733594417572, "loss/fcd": 1.40234375, "loss/idx": 8.0, "loss/logits": 0.20184049010276794, "step": 2418 }, { "epoch": 0.03612092071764012, "grad_norm": 0.455078125, "grad_norm_var": 0.0022897720336914062, "learning_rate": 0.0001, "loss": 1.6105, "loss/crossentropy": 2.623771548271179, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.19256125390529633, "step": 2419 }, { "epoch": 0.036135852888255104, "grad_norm": 0.42578125, "grad_norm_var": 0.0019093195597330729, "learning_rate": 0.0001, "loss": 1.6452, "loss/crossentropy": 2.560054063796997, "loss/fcd": 1.4453125, "loss/idx": 8.0, "loss/logits": 0.19992457330226898, "step": 2420 }, { "epoch": 0.03615078505887008, "grad_norm": 0.43359375, "grad_norm_var": 0.0019057591756184895, "learning_rate": 0.0001, "loss": 1.6366, "loss/crossentropy": 2.6419034004211426, "loss/fcd": 1.42578125, "loss/idx": 8.0, "loss/logits": 0.21079695969820023, "step": 2421 }, { "epoch": 0.03616571722948506, "grad_norm": 0.408203125, "grad_norm_var": 0.0018490950266520183, "learning_rate": 0.0001, "loss": 1.6542, "loss/crossentropy": 2.4584015607833862, "loss/fcd": 1.4296875, "loss/idx": 8.0, "loss/logits": 0.2244865968823433, "step": 2422 }, { "epoch": 0.036180649400100046, "grad_norm": 0.4921875, "grad_norm_var": 0.001804033915201823, "learning_rate": 0.0001, "loss": 1.8274, "loss/crossentropy": 2.6395381689071655, "loss/fcd": 1.57421875, "loss/idx": 8.0, "loss/logits": 0.253149077296257, "step": 2423 }, { "epoch": 0.03619558157071503, "grad_norm": 0.6953125, "grad_norm_var": 0.005658070246378581, "learning_rate": 0.0001, "loss": 1.8956, "loss/crossentropy": 2.587922215461731, "loss/fcd": 1.67578125, "loss/idx": 8.0, "loss/logits": 0.21982329338788986, "step": 2424 }, { "epoch": 0.03621051374133001, "grad_norm": 0.55078125, "grad_norm_var": 0.00611265500386556, "learning_rate": 0.0001, "loss": 1.9242, "loss/crossentropy": 2.7731103897094727, "loss/fcd": 1.67578125, "loss/idx": 8.0, "loss/logits": 0.24844194203615189, "step": 2425 }, { "epoch": 0.03622544591194499, "grad_norm": 0.451171875, "grad_norm_var": 0.005972226460774739, "learning_rate": 0.0001, "loss": 1.6367, "loss/crossentropy": 2.398366332054138, "loss/fcd": 1.44140625, "loss/idx": 8.0, "loss/logits": 0.195280559360981, "step": 2426 }, { "epoch": 0.03624037808255997, "grad_norm": 0.4375, "grad_norm_var": 0.005642811457316081, "learning_rate": 0.0001, "loss": 1.6849, "loss/crossentropy": 2.7583937644958496, "loss/fcd": 1.45703125, "loss/idx": 8.0, "loss/logits": 0.2279180884361267, "step": 2427 }, { "epoch": 0.03625531025317495, "grad_norm": 0.369140625, "grad_norm_var": 0.006087605158487956, "learning_rate": 0.0001, "loss": 1.5579, "loss/crossentropy": 2.5390326976776123, "loss/fcd": 1.3671875, "loss/idx": 8.0, "loss/logits": 0.19071626663208008, "step": 2428 }, { "epoch": 0.036270242423789935, "grad_norm": 0.4921875, "grad_norm_var": 0.0061402479807535805, "learning_rate": 0.0001, "loss": 1.8348, "loss/crossentropy": 2.642075777053833, "loss/fcd": 1.59765625, "loss/idx": 8.0, "loss/logits": 0.23718450218439102, "step": 2429 }, { "epoch": 0.03628517459440492, "grad_norm": 0.48046875, "grad_norm_var": 0.006028985977172852, "learning_rate": 0.0001, "loss": 1.5589, "loss/crossentropy": 2.7885069847106934, "loss/fcd": 1.3671875, "loss/idx": 8.0, "loss/logits": 0.19175666570663452, "step": 2430 }, { "epoch": 0.0363001067650199, "grad_norm": 0.419921875, "grad_norm_var": 0.006028985977172852, "learning_rate": 0.0001, "loss": 1.6112, "loss/crossentropy": 2.4662705659866333, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.1932341679930687, "step": 2431 }, { "epoch": 0.036315038935634876, "grad_norm": 0.390625, "grad_norm_var": 0.006275288263956706, "learning_rate": 0.0001, "loss": 1.5873, "loss/crossentropy": 2.4667434692382812, "loss/fcd": 1.390625, "loss/idx": 8.0, "loss/logits": 0.19669387489557266, "step": 2432 }, { "epoch": 0.03632997110624986, "grad_norm": 0.44140625, "grad_norm_var": 0.0062267144521077475, "learning_rate": 0.0001, "loss": 1.5762, "loss/crossentropy": 2.6293481588363647, "loss/fcd": 1.390625, "loss/idx": 8.0, "loss/logits": 0.18559803813695908, "step": 2433 }, { "epoch": 0.03634490327686484, "grad_norm": 0.486328125, "grad_norm_var": 0.005778249104817708, "learning_rate": 0.0001, "loss": 1.65, "loss/crossentropy": 2.606359839439392, "loss/fcd": 1.453125, "loss/idx": 8.0, "loss/logits": 0.19685593247413635, "step": 2434 }, { "epoch": 0.036359835447479824, "grad_norm": 0.390625, "grad_norm_var": 0.006117614110310873, "learning_rate": 0.0001, "loss": 1.6239, "loss/crossentropy": 2.3494625091552734, "loss/fcd": 1.42578125, "loss/idx": 8.0, "loss/logits": 0.19811765104532242, "step": 2435 }, { "epoch": 0.03637476761809481, "grad_norm": 0.48828125, "grad_norm_var": 0.006073872248331706, "learning_rate": 0.0001, "loss": 1.8717, "loss/crossentropy": 2.609783411026001, "loss/fcd": 1.5859375, "loss/idx": 8.0, "loss/logits": 0.2857333719730377, "step": 2436 }, { "epoch": 0.03638969978870978, "grad_norm": 0.4296875, "grad_norm_var": 0.006090784072875976, "learning_rate": 0.0001, "loss": 1.5663, "loss/crossentropy": 2.6469011306762695, "loss/fcd": 1.3671875, "loss/idx": 8.0, "loss/logits": 0.19910111278295517, "step": 2437 }, { "epoch": 0.036404631959324765, "grad_norm": 0.455078125, "grad_norm_var": 0.005879449844360352, "learning_rate": 0.0001, "loss": 1.6034, "loss/crossentropy": 2.5134408473968506, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.18538911640644073, "step": 2438 }, { "epoch": 0.03641956412993975, "grad_norm": 0.41796875, "grad_norm_var": 0.005973672866821289, "learning_rate": 0.0001, "loss": 1.4925, "loss/crossentropy": 2.53541100025177, "loss/fcd": 1.32421875, "loss/idx": 8.0, "loss/logits": 0.1683252528309822, "step": 2439 }, { "epoch": 0.03643449630055473, "grad_norm": 0.431640625, "grad_norm_var": 0.002126312255859375, "learning_rate": 0.0001, "loss": 1.5302, "loss/crossentropy": 2.6855000257492065, "loss/fcd": 1.34765625, "loss/idx": 8.0, "loss/logits": 0.18252353370189667, "step": 2440 }, { "epoch": 0.03644942847116971, "grad_norm": 0.625, "grad_norm_var": 0.0035094579060872396, "learning_rate": 0.0001, "loss": 1.9808, "loss/crossentropy": 2.836515188217163, "loss/fcd": 1.671875, "loss/idx": 8.0, "loss/logits": 0.3089062571525574, "step": 2441 }, { "epoch": 0.036464360641784696, "grad_norm": 0.46875, "grad_norm_var": 0.003530486424763997, "learning_rate": 0.0001, "loss": 1.5475, "loss/crossentropy": 2.5589300394058228, "loss/fcd": 1.36328125, "loss/idx": 8.0, "loss/logits": 0.18417616933584213, "step": 2442 }, { "epoch": 0.03647929281239967, "grad_norm": 0.400390625, "grad_norm_var": 0.0036860148111979166, "learning_rate": 0.0001, "loss": 1.6067, "loss/crossentropy": 2.5068148374557495, "loss/fcd": 1.4140625, "loss/idx": 8.0, "loss/logits": 0.1926576867699623, "step": 2443 }, { "epoch": 0.036494224983014655, "grad_norm": 0.4140625, "grad_norm_var": 0.0033325036366780598, "learning_rate": 0.0001, "loss": 1.4708, "loss/crossentropy": 2.701190233230591, "loss/fcd": 1.296875, "loss/idx": 8.0, "loss/logits": 0.17387530952692032, "step": 2444 }, { "epoch": 0.03650915715362964, "grad_norm": 0.3671875, "grad_norm_var": 0.0036397139231363934, "learning_rate": 0.0001, "loss": 1.4684, "loss/crossentropy": 2.7218198776245117, "loss/fcd": 1.30078125, "loss/idx": 8.0, "loss/logits": 0.16764816641807556, "step": 2445 }, { "epoch": 0.03652408932424462, "grad_norm": 0.423828125, "grad_norm_var": 0.0035664240519205728, "learning_rate": 0.0001, "loss": 1.5359, "loss/crossentropy": 2.771889328956604, "loss/fcd": 1.35546875, "loss/idx": 8.0, "loss/logits": 0.1804812252521515, "step": 2446 }, { "epoch": 0.0365390214948596, "grad_norm": 0.47265625, "grad_norm_var": 0.0035943190256754556, "learning_rate": 0.0001, "loss": 1.7449, "loss/crossentropy": 2.6609978675842285, "loss/fcd": 1.51953125, "loss/idx": 8.0, "loss/logits": 0.22540705651044846, "step": 2447 }, { "epoch": 0.03655395366547458, "grad_norm": 0.419921875, "grad_norm_var": 0.0034395853678385415, "learning_rate": 0.0001, "loss": 1.5619, "loss/crossentropy": 2.814198136329651, "loss/fcd": 1.37109375, "loss/idx": 8.0, "loss/logits": 0.19082403182983398, "step": 2448 }, { "epoch": 0.03656888583608956, "grad_norm": 0.38671875, "grad_norm_var": 0.003658548990885417, "learning_rate": 0.0001, "loss": 1.5185, "loss/crossentropy": 2.5829941034317017, "loss/fcd": 1.33984375, "loss/idx": 8.0, "loss/logits": 0.17865874618291855, "step": 2449 }, { "epoch": 0.036583818006704544, "grad_norm": 0.4375, "grad_norm_var": 0.0035214583079020184, "learning_rate": 0.0001, "loss": 1.5501, "loss/crossentropy": 2.6017762422561646, "loss/fcd": 1.3671875, "loss/idx": 8.0, "loss/logits": 0.18295357376337051, "step": 2450 }, { "epoch": 0.036598750177319526, "grad_norm": 0.7109375, "grad_norm_var": 0.007853809992472332, "learning_rate": 0.0001, "loss": 1.832, "loss/crossentropy": 2.8203554153442383, "loss/fcd": 1.5859375, "loss/idx": 8.0, "loss/logits": 0.24601604044437408, "step": 2451 }, { "epoch": 0.03661368234793451, "grad_norm": 0.447265625, "grad_norm_var": 0.007800738016764323, "learning_rate": 0.0001, "loss": 1.62, "loss/crossentropy": 2.6139018535614014, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.19809486716985703, "step": 2452 }, { "epoch": 0.03662861451854949, "grad_norm": 0.3984375, "grad_norm_var": 0.007974688212076824, "learning_rate": 0.0001, "loss": 1.4851, "loss/crossentropy": 2.5488556623458862, "loss/fcd": 1.3125, "loss/idx": 8.0, "loss/logits": 0.17262020707130432, "step": 2453 }, { "epoch": 0.03664354668916447, "grad_norm": 0.412109375, "grad_norm_var": 0.00808868408203125, "learning_rate": 0.0001, "loss": 1.625, "loss/crossentropy": 2.242365598678589, "loss/fcd": 1.4296875, "loss/idx": 8.0, "loss/logits": 0.19529011845588684, "step": 2454 }, { "epoch": 0.03665847885977945, "grad_norm": 0.55859375, "grad_norm_var": 0.00868377685546875, "learning_rate": 0.0001, "loss": 1.6125, "loss/crossentropy": 2.603208303451538, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.19063596427440643, "step": 2455 }, { "epoch": 0.03667341103039443, "grad_norm": 0.41015625, "grad_norm_var": 0.00879654884338379, "learning_rate": 0.0001, "loss": 1.4843, "loss/crossentropy": 2.575288772583008, "loss/fcd": 1.3046875, "loss/idx": 8.0, "loss/logits": 0.17959459125995636, "step": 2456 }, { "epoch": 0.036688343201009416, "grad_norm": 0.75390625, "grad_norm_var": 0.012678003311157227, "learning_rate": 0.0001, "loss": 1.7458, "loss/crossentropy": 2.4697948694229126, "loss/fcd": 1.55078125, "loss/idx": 8.0, "loss/logits": 0.19501008093357086, "step": 2457 }, { "epoch": 0.0367032753716244, "grad_norm": 0.44140625, "grad_norm_var": 0.012720727920532226, "learning_rate": 0.0001, "loss": 1.5117, "loss/crossentropy": 2.5631171464920044, "loss/fcd": 1.328125, "loss/idx": 8.0, "loss/logits": 0.18353476375341415, "step": 2458 }, { "epoch": 0.036718207542239374, "grad_norm": 0.5625, "grad_norm_var": 0.012946319580078126, "learning_rate": 0.0001, "loss": 1.6777, "loss/crossentropy": 2.3507306575775146, "loss/fcd": 1.48828125, "loss/idx": 8.0, "loss/logits": 0.18942810595035553, "step": 2459 }, { "epoch": 0.03673313971285436, "grad_norm": 0.4296875, "grad_norm_var": 0.012832387288411459, "learning_rate": 0.0001, "loss": 1.6813, "loss/crossentropy": 2.6251943111419678, "loss/fcd": 1.46484375, "loss/idx": 8.0, "loss/logits": 0.21645274013280869, "step": 2460 }, { "epoch": 0.03674807188346934, "grad_norm": 0.455078125, "grad_norm_var": 0.012027724583943685, "learning_rate": 0.0001, "loss": 1.5656, "loss/crossentropy": 2.518959403038025, "loss/fcd": 1.390625, "loss/idx": 8.0, "loss/logits": 0.17499826848506927, "step": 2461 }, { "epoch": 0.03676300405408432, "grad_norm": 0.5703125, "grad_norm_var": 0.012222035725911459, "learning_rate": 0.0001, "loss": 1.6977, "loss/crossentropy": 2.834190011024475, "loss/fcd": 1.484375, "loss/idx": 8.0, "loss/logits": 0.2132972851395607, "step": 2462 }, { "epoch": 0.036777936224699305, "grad_norm": 0.439453125, "grad_norm_var": 0.012375243504842122, "learning_rate": 0.0001, "loss": 1.6991, "loss/crossentropy": 2.4817334413528442, "loss/fcd": 1.4765625, "loss/idx": 8.0, "loss/logits": 0.22251743078231812, "step": 2463 }, { "epoch": 0.03679286839531429, "grad_norm": 0.447265625, "grad_norm_var": 0.012167851130167643, "learning_rate": 0.0001, "loss": 1.5979, "loss/crossentropy": 2.6811128854751587, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.19943107664585114, "step": 2464 }, { "epoch": 0.03680780056592926, "grad_norm": 0.44140625, "grad_norm_var": 0.011591958999633788, "learning_rate": 0.0001, "loss": 1.8312, "loss/crossentropy": 2.627020001411438, "loss/fcd": 1.5625, "loss/idx": 8.0, "loss/logits": 0.26874975860118866, "step": 2465 }, { "epoch": 0.036822732736544246, "grad_norm": 0.443359375, "grad_norm_var": 0.01154937744140625, "learning_rate": 0.0001, "loss": 1.6206, "loss/crossentropy": 2.44194757938385, "loss/fcd": 1.4375, "loss/idx": 8.0, "loss/logits": 0.183101624250412, "step": 2466 }, { "epoch": 0.03683766490715923, "grad_norm": 0.79296875, "grad_norm_var": 0.014330482482910157, "learning_rate": 0.0001, "loss": 1.834, "loss/crossentropy": 2.760730504989624, "loss/fcd": 1.60546875, "loss/idx": 8.0, "loss/logits": 0.22855059057474136, "step": 2467 }, { "epoch": 0.03685259707777421, "grad_norm": 0.56640625, "grad_norm_var": 0.014376052220662435, "learning_rate": 0.0001, "loss": 1.8207, "loss/crossentropy": 2.304331660270691, "loss/fcd": 1.60546875, "loss/idx": 8.0, "loss/logits": 0.21518544852733612, "step": 2468 }, { "epoch": 0.036867529248389194, "grad_norm": 0.474609375, "grad_norm_var": 0.01362908681233724, "learning_rate": 0.0001, "loss": 1.5803, "loss/crossentropy": 2.814204454421997, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.19357898831367493, "step": 2469 }, { "epoch": 0.03688246141900418, "grad_norm": 0.486328125, "grad_norm_var": 0.012980397542317708, "learning_rate": 0.0001, "loss": 1.5957, "loss/crossentropy": 2.736605644226074, "loss/fcd": 1.40625, "loss/idx": 8.0, "loss/logits": 0.18949368596076965, "step": 2470 }, { "epoch": 0.03689739358961915, "grad_norm": 0.466796875, "grad_norm_var": 0.01299907366434733, "learning_rate": 0.0001, "loss": 1.6223, "loss/crossentropy": 2.5802935361862183, "loss/fcd": 1.43359375, "loss/idx": 8.0, "loss/logits": 0.18868596851825714, "step": 2471 }, { "epoch": 0.036912325760234135, "grad_norm": 0.443359375, "grad_norm_var": 0.012619972229003906, "learning_rate": 0.0001, "loss": 1.5275, "loss/crossentropy": 2.7563517093658447, "loss/fcd": 1.34375, "loss/idx": 8.0, "loss/logits": 0.18372995406389236, "step": 2472 }, { "epoch": 0.03692725793084912, "grad_norm": 0.55078125, "grad_norm_var": 0.008685747782389322, "learning_rate": 0.0001, "loss": 1.776, "loss/crossentropy": 2.459903836250305, "loss/fcd": 1.5625, "loss/idx": 8.0, "loss/logits": 0.21354176849126816, "step": 2473 }, { "epoch": 0.0369421901014641, "grad_norm": 0.447265625, "grad_norm_var": 0.008641544977823894, "learning_rate": 0.0001, "loss": 1.6326, "loss/crossentropy": 2.761512041091919, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.21070978045463562, "step": 2474 }, { "epoch": 0.03695712227207908, "grad_norm": 0.392578125, "grad_norm_var": 0.009055010477701823, "learning_rate": 0.0001, "loss": 1.4983, "loss/crossentropy": 2.6376800537109375, "loss/fcd": 1.3203125, "loss/idx": 8.0, "loss/logits": 0.17796095460653305, "step": 2475 }, { "epoch": 0.03697205444269406, "grad_norm": 0.482421875, "grad_norm_var": 0.00880138079325358, "learning_rate": 0.0001, "loss": 1.6741, "loss/crossentropy": 2.7889968156814575, "loss/fcd": 1.4609375, "loss/idx": 8.0, "loss/logits": 0.21319925785064697, "step": 2476 }, { "epoch": 0.03698698661330904, "grad_norm": 0.43359375, "grad_norm_var": 0.008941078186035156, "learning_rate": 0.0001, "loss": 1.7714, "loss/crossentropy": 2.352063298225403, "loss/fcd": 1.5390625, "loss/idx": 8.0, "loss/logits": 0.23237252980470657, "step": 2477 }, { "epoch": 0.037001918783924025, "grad_norm": 0.388671875, "grad_norm_var": 0.00911698341369629, "learning_rate": 0.0001, "loss": 1.5519, "loss/crossentropy": 2.636815667152405, "loss/fcd": 1.3515625, "loss/idx": 8.0, "loss/logits": 0.2003871649503708, "step": 2478 }, { "epoch": 0.03701685095453901, "grad_norm": 0.37890625, "grad_norm_var": 0.009682146708170573, "learning_rate": 0.0001, "loss": 1.5057, "loss/crossentropy": 2.606695294380188, "loss/fcd": 1.31640625, "loss/idx": 8.0, "loss/logits": 0.18933551013469696, "step": 2479 }, { "epoch": 0.03703178312515399, "grad_norm": 0.41796875, "grad_norm_var": 0.009853092829386394, "learning_rate": 0.0001, "loss": 1.5533, "loss/crossentropy": 2.475183367729187, "loss/fcd": 1.37109375, "loss/idx": 8.0, "loss/logits": 0.18217281997203827, "step": 2480 }, { "epoch": 0.03704671529576897, "grad_norm": 0.51171875, "grad_norm_var": 0.009842793146769205, "learning_rate": 0.0001, "loss": 1.65, "loss/crossentropy": 2.919053792953491, "loss/fcd": 1.4453125, "loss/idx": 8.0, "loss/logits": 0.20467495173215866, "step": 2481 }, { "epoch": 0.03706164746638395, "grad_norm": 0.369140625, "grad_norm_var": 0.010548257827758789, "learning_rate": 0.0001, "loss": 1.5358, "loss/crossentropy": 2.5359466075897217, "loss/fcd": 1.34765625, "loss/idx": 8.0, "loss/logits": 0.18813680112361908, "step": 2482 }, { "epoch": 0.03707657963699893, "grad_norm": 0.4453125, "grad_norm_var": 0.0033733208974202475, "learning_rate": 0.0001, "loss": 1.5598, "loss/crossentropy": 2.768907070159912, "loss/fcd": 1.359375, "loss/idx": 8.0, "loss/logits": 0.20045947283506393, "step": 2483 }, { "epoch": 0.037091511807613914, "grad_norm": 0.453125, "grad_norm_var": 0.0024698734283447265, "learning_rate": 0.0001, "loss": 1.56, "loss/crossentropy": 2.478017568588257, "loss/fcd": 1.37109375, "loss/idx": 8.0, "loss/logits": 0.1889248639345169, "step": 2484 }, { "epoch": 0.0371064439782289, "grad_norm": 0.33984375, "grad_norm_var": 0.003098297119140625, "learning_rate": 0.0001, "loss": 1.3597, "loss/crossentropy": 2.5096648931503296, "loss/fcd": 1.21484375, "loss/idx": 8.0, "loss/logits": 0.14483103156089783, "step": 2485 }, { "epoch": 0.03712137614884388, "grad_norm": 0.474609375, "grad_norm_var": 0.003031349182128906, "learning_rate": 0.0001, "loss": 1.7335, "loss/crossentropy": 2.445379853248596, "loss/fcd": 1.51953125, "loss/idx": 8.0, "loss/logits": 0.213978573679924, "step": 2486 }, { "epoch": 0.037136308319458855, "grad_norm": 0.443359375, "grad_norm_var": 0.0029733657836914064, "learning_rate": 0.0001, "loss": 1.8088, "loss/crossentropy": 2.645754098892212, "loss/fcd": 1.58203125, "loss/idx": 8.0, "loss/logits": 0.22676464915275574, "step": 2487 }, { "epoch": 0.03715124049007384, "grad_norm": 0.419921875, "grad_norm_var": 0.0029840469360351562, "learning_rate": 0.0001, "loss": 1.7823, "loss/crossentropy": 2.3314541578292847, "loss/fcd": 1.55078125, "loss/idx": 8.0, "loss/logits": 0.2314802035689354, "step": 2488 }, { "epoch": 0.03716617266068882, "grad_norm": 0.423828125, "grad_norm_var": 0.002020120620727539, "learning_rate": 0.0001, "loss": 1.6156, "loss/crossentropy": 2.4767589569091797, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.1937345713376999, "step": 2489 }, { "epoch": 0.0371811048313038, "grad_norm": 0.40625, "grad_norm_var": 0.0020111083984375, "learning_rate": 0.0001, "loss": 1.3776, "loss/crossentropy": 2.6839234828948975, "loss/fcd": 1.2265625, "loss/idx": 8.0, "loss/logits": 0.1510133594274521, "step": 2490 }, { "epoch": 0.037196037001918786, "grad_norm": 0.412109375, "grad_norm_var": 0.001953570048014323, "learning_rate": 0.0001, "loss": 1.6522, "loss/crossentropy": 2.5977165699005127, "loss/fcd": 1.44140625, "loss/idx": 8.0, "loss/logits": 0.2108018845319748, "step": 2491 }, { "epoch": 0.03721096917253377, "grad_norm": 0.3828125, "grad_norm_var": 0.0018117109934488933, "learning_rate": 0.0001, "loss": 1.4397, "loss/crossentropy": 2.5301756858825684, "loss/fcd": 1.27734375, "loss/idx": 8.0, "loss/logits": 0.16231189668178558, "step": 2492 }, { "epoch": 0.037225901343148744, "grad_norm": 0.44921875, "grad_norm_var": 0.001857741673787435, "learning_rate": 0.0001, "loss": 1.6069, "loss/crossentropy": 2.4979443550109863, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.18506374210119247, "step": 2493 }, { "epoch": 0.03724083351376373, "grad_norm": 0.484375, "grad_norm_var": 0.002032979329427083, "learning_rate": 0.0001, "loss": 1.6166, "loss/crossentropy": 2.3945562839508057, "loss/fcd": 1.4296875, "loss/idx": 8.0, "loss/logits": 0.18693313747644424, "step": 2494 }, { "epoch": 0.03725576568437871, "grad_norm": 0.431640625, "grad_norm_var": 0.0018771966298421225, "learning_rate": 0.0001, "loss": 1.7185, "loss/crossentropy": 2.656275987625122, "loss/fcd": 1.5, "loss/idx": 8.0, "loss/logits": 0.21848513185977936, "step": 2495 }, { "epoch": 0.03727069785499369, "grad_norm": 0.427734375, "grad_norm_var": 0.0018686930338541667, "learning_rate": 0.0001, "loss": 1.5719, "loss/crossentropy": 2.606783390045166, "loss/fcd": 1.39453125, "loss/idx": 8.0, "loss/logits": 0.17736031860113144, "step": 2496 }, { "epoch": 0.037285630025608675, "grad_norm": 0.40625, "grad_norm_var": 0.0014103571573893228, "learning_rate": 0.0001, "loss": 1.6864, "loss/crossentropy": 2.565021276473999, "loss/fcd": 1.46875, "loss/idx": 8.0, "loss/logits": 0.2176225557923317, "step": 2497 }, { "epoch": 0.03730056219622365, "grad_norm": 0.408203125, "grad_norm_var": 0.0012247085571289063, "learning_rate": 0.0001, "loss": 1.5119, "loss/crossentropy": 2.6501970291137695, "loss/fcd": 1.33984375, "loss/idx": 8.0, "loss/logits": 0.1720181331038475, "step": 2498 }, { "epoch": 0.037315494366838634, "grad_norm": 0.404296875, "grad_norm_var": 0.0012217044830322265, "learning_rate": 0.0001, "loss": 1.5555, "loss/crossentropy": 2.739599823951721, "loss/fcd": 1.359375, "loss/idx": 8.0, "loss/logits": 0.19616509974002838, "step": 2499 }, { "epoch": 0.037330426537453616, "grad_norm": 0.435546875, "grad_norm_var": 0.00117034912109375, "learning_rate": 0.0001, "loss": 1.746, "loss/crossentropy": 2.598976731300354, "loss/fcd": 1.53125, "loss/idx": 8.0, "loss/logits": 0.21473443508148193, "step": 2500 }, { "epoch": 0.0373453587080686, "grad_norm": 0.439453125, "grad_norm_var": 0.000700998306274414, "learning_rate": 0.0001, "loss": 1.5474, "loss/crossentropy": 2.4935563802719116, "loss/fcd": 1.35546875, "loss/idx": 8.0, "loss/logits": 0.1919759139418602, "step": 2501 }, { "epoch": 0.03736029087868358, "grad_norm": 0.41796875, "grad_norm_var": 0.0005502700805664062, "learning_rate": 0.0001, "loss": 1.6283, "loss/crossentropy": 2.5674413442611694, "loss/fcd": 1.4296875, "loss/idx": 8.0, "loss/logits": 0.198564812541008, "step": 2502 }, { "epoch": 0.037375223049298564, "grad_norm": 0.3984375, "grad_norm_var": 0.0005637963612874348, "learning_rate": 0.0001, "loss": 1.554, "loss/crossentropy": 2.56048047542572, "loss/fcd": 1.375, "loss/idx": 8.0, "loss/logits": 0.17898903042078018, "step": 2503 }, { "epoch": 0.03739015521991354, "grad_norm": 0.4453125, "grad_norm_var": 0.0005978902180989583, "learning_rate": 0.0001, "loss": 1.5835, "loss/crossentropy": 2.824795365333557, "loss/fcd": 1.3828125, "loss/idx": 8.0, "loss/logits": 0.20069345086812973, "step": 2504 }, { "epoch": 0.03740508739052852, "grad_norm": 0.3984375, "grad_norm_var": 0.0006365299224853516, "learning_rate": 0.0001, "loss": 1.6173, "loss/crossentropy": 2.4702374935150146, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.19932619482278824, "step": 2505 }, { "epoch": 0.037420019561143505, "grad_norm": 0.416015625, "grad_norm_var": 0.0006223042805989583, "learning_rate": 0.0001, "loss": 1.6636, "loss/crossentropy": 2.6721051931381226, "loss/fcd": 1.44140625, "loss/idx": 8.0, "loss/logits": 0.22214733064174652, "step": 2506 }, { "epoch": 0.03743495173175849, "grad_norm": 0.42578125, "grad_norm_var": 0.0006152947743733723, "learning_rate": 0.0001, "loss": 1.52, "loss/crossentropy": 2.5858603715896606, "loss/fcd": 1.33984375, "loss/idx": 8.0, "loss/logits": 0.18019453436136246, "step": 2507 }, { "epoch": 0.03744988390237347, "grad_norm": 0.3984375, "grad_norm_var": 0.0005463759104410807, "learning_rate": 0.0001, "loss": 1.5186, "loss/crossentropy": 2.4698894023895264, "loss/fcd": 1.33984375, "loss/idx": 8.0, "loss/logits": 0.17874377965927124, "step": 2508 }, { "epoch": 0.03746481607298845, "grad_norm": 0.421875, "grad_norm_var": 0.0005018711090087891, "learning_rate": 0.0001, "loss": 1.4892, "loss/crossentropy": 2.572903871536255, "loss/fcd": 1.328125, "loss/idx": 8.0, "loss/logits": 0.16111696511507034, "step": 2509 }, { "epoch": 0.03747974824360343, "grad_norm": 0.412109375, "grad_norm_var": 0.00023193359375, "learning_rate": 0.0001, "loss": 1.5834, "loss/crossentropy": 2.574868321418762, "loss/fcd": 1.390625, "loss/idx": 8.0, "loss/logits": 0.19272977858781815, "step": 2510 }, { "epoch": 0.03749468041421841, "grad_norm": 0.5546875, "grad_norm_var": 0.0014025211334228516, "learning_rate": 0.0001, "loss": 1.8305, "loss/crossentropy": 2.631860375404358, "loss/fcd": 1.58203125, "loss/idx": 8.0, "loss/logits": 0.248466357588768, "step": 2511 }, { "epoch": 0.037509612584833395, "grad_norm": 0.400390625, "grad_norm_var": 0.0014416853586832683, "learning_rate": 0.0001, "loss": 1.5604, "loss/crossentropy": 2.9449307918548584, "loss/fcd": 1.375, "loss/idx": 8.0, "loss/logits": 0.18537471443414688, "step": 2512 }, { "epoch": 0.03752454475544838, "grad_norm": 0.41015625, "grad_norm_var": 0.001433420181274414, "learning_rate": 0.0001, "loss": 1.7331, "loss/crossentropy": 2.5279860496520996, "loss/fcd": 1.5, "loss/idx": 8.0, "loss/logits": 0.23310866951942444, "step": 2513 }, { "epoch": 0.03753947692606336, "grad_norm": 0.447265625, "grad_norm_var": 0.0014455000559488933, "learning_rate": 0.0001, "loss": 1.6653, "loss/crossentropy": 2.8770030736923218, "loss/fcd": 1.44921875, "loss/idx": 8.0, "loss/logits": 0.21603310108184814, "step": 2514 }, { "epoch": 0.037554409096678336, "grad_norm": 0.42578125, "grad_norm_var": 0.0014103571573893228, "learning_rate": 0.0001, "loss": 1.6294, "loss/crossentropy": 2.119299829006195, "loss/fcd": 1.42578125, "loss/idx": 8.0, "loss/logits": 0.20358960330486298, "step": 2515 }, { "epoch": 0.03756934126729332, "grad_norm": 0.44140625, "grad_norm_var": 0.0014184157053629557, "learning_rate": 0.0001, "loss": 1.4774, "loss/crossentropy": 2.6470255851745605, "loss/fcd": 1.3125, "loss/idx": 8.0, "loss/logits": 0.16488387435674667, "step": 2516 }, { "epoch": 0.0375842734379083, "grad_norm": 0.392578125, "grad_norm_var": 0.0014863173166910807, "learning_rate": 0.0001, "loss": 1.4684, "loss/crossentropy": 2.7195645570755005, "loss/fcd": 1.2890625, "loss/idx": 8.0, "loss/logits": 0.17930680513381958, "step": 2517 }, { "epoch": 0.037599205608523284, "grad_norm": 0.400390625, "grad_norm_var": 0.0015230814615885416, "learning_rate": 0.0001, "loss": 1.4912, "loss/crossentropy": 2.5517027378082275, "loss/fcd": 1.3125, "loss/idx": 8.0, "loss/logits": 0.1787310391664505, "step": 2518 }, { "epoch": 0.03761413777913827, "grad_norm": 0.486328125, "grad_norm_var": 0.0017026106516520182, "learning_rate": 0.0001, "loss": 1.6107, "loss/crossentropy": 2.7282869815826416, "loss/fcd": 1.4140625, "loss/idx": 8.0, "loss/logits": 0.19659479707479477, "step": 2519 }, { "epoch": 0.03762906994975324, "grad_norm": 0.4765625, "grad_norm_var": 0.0018282413482666015, "learning_rate": 0.0001, "loss": 1.6155, "loss/crossentropy": 2.6164207458496094, "loss/fcd": 1.4140625, "loss/idx": 8.0, "loss/logits": 0.20148704200983047, "step": 2520 }, { "epoch": 0.037644002120368225, "grad_norm": 0.4375, "grad_norm_var": 0.001750040054321289, "learning_rate": 0.0001, "loss": 1.4869, "loss/crossentropy": 2.6544848680496216, "loss/fcd": 1.30859375, "loss/idx": 8.0, "loss/logits": 0.1783059760928154, "step": 2521 }, { "epoch": 0.03765893429098321, "grad_norm": 0.44140625, "grad_norm_var": 0.0017287572224934896, "learning_rate": 0.0001, "loss": 1.6681, "loss/crossentropy": 2.4938647747039795, "loss/fcd": 1.4609375, "loss/idx": 8.0, "loss/logits": 0.20719844847917557, "step": 2522 }, { "epoch": 0.03767386646159819, "grad_norm": 0.44921875, "grad_norm_var": 0.0017318089803059896, "learning_rate": 0.0001, "loss": 1.5211, "loss/crossentropy": 2.5113366842269897, "loss/fcd": 1.35546875, "loss/idx": 8.0, "loss/logits": 0.16562466323375702, "step": 2523 }, { "epoch": 0.03768879863221317, "grad_norm": 0.67578125, "grad_norm_var": 0.0051038106282552086, "learning_rate": 0.0001, "loss": 1.6089, "loss/crossentropy": 2.574048638343811, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.18702325224876404, "step": 2524 }, { "epoch": 0.037703730802828156, "grad_norm": 0.515625, "grad_norm_var": 0.005244191487630208, "learning_rate": 0.0001, "loss": 1.7969, "loss/crossentropy": 2.48725688457489, "loss/fcd": 1.58203125, "loss/idx": 8.0, "loss/logits": 0.21484121680259705, "step": 2525 }, { "epoch": 0.03771866297344313, "grad_norm": 0.5546875, "grad_norm_var": 0.005595763524373372, "learning_rate": 0.0001, "loss": 1.6436, "loss/crossentropy": 2.598286271095276, "loss/fcd": 1.42578125, "loss/idx": 8.0, "loss/logits": 0.21781045198440552, "step": 2526 }, { "epoch": 0.037733595144058114, "grad_norm": 0.458984375, "grad_norm_var": 0.005079396565755208, "learning_rate": 0.0001, "loss": 1.7083, "loss/crossentropy": 2.3635621070861816, "loss/fcd": 1.50390625, "loss/idx": 8.0, "loss/logits": 0.20443390309810638, "step": 2527 }, { "epoch": 0.0377485273146731, "grad_norm": 0.443359375, "grad_norm_var": 0.00483392079671224, "learning_rate": 0.0001, "loss": 1.6467, "loss/crossentropy": 2.5126349925994873, "loss/fcd": 1.44140625, "loss/idx": 8.0, "loss/logits": 0.2053196057677269, "step": 2528 }, { "epoch": 0.03776345948528808, "grad_norm": 0.423828125, "grad_norm_var": 0.004743687311808268, "learning_rate": 0.0001, "loss": 1.7529, "loss/crossentropy": 2.497701048851013, "loss/fcd": 1.5234375, "loss/idx": 8.0, "loss/logits": 0.22943350672721863, "step": 2529 }, { "epoch": 0.03777839165590306, "grad_norm": 0.484375, "grad_norm_var": 0.004732513427734375, "learning_rate": 0.0001, "loss": 1.4648, "loss/crossentropy": 2.669295310974121, "loss/fcd": 1.296875, "loss/idx": 8.0, "loss/logits": 0.1679273545742035, "step": 2530 }, { "epoch": 0.037793323826518045, "grad_norm": 0.44140625, "grad_norm_var": 0.004657236735026041, "learning_rate": 0.0001, "loss": 1.4556, "loss/crossentropy": 2.75239360332489, "loss/fcd": 1.2890625, "loss/idx": 8.0, "loss/logits": 0.16649017482995987, "step": 2531 }, { "epoch": 0.03780825599713302, "grad_norm": 0.37109375, "grad_norm_var": 0.005236307779947917, "learning_rate": 0.0001, "loss": 1.4383, "loss/crossentropy": 2.7303972244262695, "loss/fcd": 1.26953125, "loss/idx": 8.0, "loss/logits": 0.16880206763744354, "step": 2532 }, { "epoch": 0.037823188167748004, "grad_norm": 0.4296875, "grad_norm_var": 0.004959980646769206, "learning_rate": 0.0001, "loss": 1.5752, "loss/crossentropy": 2.6886669397354126, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.1884622648358345, "step": 2533 }, { "epoch": 0.037838120338362986, "grad_norm": 0.54296875, "grad_norm_var": 0.004942576090494792, "learning_rate": 0.0001, "loss": 1.8629, "loss/crossentropy": 2.8441827297210693, "loss/fcd": 1.61328125, "loss/idx": 8.0, "loss/logits": 0.24960508942604065, "step": 2534 }, { "epoch": 0.03785305250897797, "grad_norm": 0.431640625, "grad_norm_var": 0.0050618489583333336, "learning_rate": 0.0001, "loss": 1.6098, "loss/crossentropy": 2.637432813644409, "loss/fcd": 1.40625, "loss/idx": 8.0, "loss/logits": 0.20356318354606628, "step": 2535 }, { "epoch": 0.03786798467959295, "grad_norm": 0.388671875, "grad_norm_var": 0.005510314305623373, "learning_rate": 0.0001, "loss": 1.4832, "loss/crossentropy": 2.718000054359436, "loss/fcd": 1.30859375, "loss/idx": 8.0, "loss/logits": 0.1745632067322731, "step": 2536 }, { "epoch": 0.03788291685020793, "grad_norm": 0.42578125, "grad_norm_var": 0.0055667718251546225, "learning_rate": 0.0001, "loss": 1.5632, "loss/crossentropy": 2.5870991945266724, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.17647727578878403, "step": 2537 }, { "epoch": 0.03789784902082291, "grad_norm": 0.46484375, "grad_norm_var": 0.005519851048787435, "learning_rate": 0.0001, "loss": 1.8245, "loss/crossentropy": 2.4887245893478394, "loss/fcd": 1.56640625, "loss/idx": 8.0, "loss/logits": 0.2580552399158478, "step": 2538 }, { "epoch": 0.03791278119143789, "grad_norm": 0.447265625, "grad_norm_var": 0.00552520751953125, "learning_rate": 0.0001, "loss": 1.5957, "loss/crossentropy": 2.716525077819824, "loss/fcd": 1.40234375, "loss/idx": 8.0, "loss/logits": 0.1933377981185913, "step": 2539 }, { "epoch": 0.037927713362052876, "grad_norm": 0.40234375, "grad_norm_var": 0.0026501973470052084, "learning_rate": 0.0001, "loss": 1.6764, "loss/crossentropy": 2.378761410713196, "loss/fcd": 1.45703125, "loss/idx": 8.0, "loss/logits": 0.2194046452641487, "step": 2540 }, { "epoch": 0.03794264553266786, "grad_norm": 0.42578125, "grad_norm_var": 0.002388445536295573, "learning_rate": 0.0001, "loss": 1.5056, "loss/crossentropy": 2.3840073347091675, "loss/fcd": 1.33203125, "loss/idx": 8.0, "loss/logits": 0.17356640100479126, "step": 2541 }, { "epoch": 0.03795757770328284, "grad_norm": 0.431640625, "grad_norm_var": 0.0015523115793863933, "learning_rate": 0.0001, "loss": 1.7525, "loss/crossentropy": 2.59433114528656, "loss/fcd": 1.5234375, "loss/idx": 8.0, "loss/logits": 0.22901389002799988, "step": 2542 }, { "epoch": 0.03797250987389782, "grad_norm": 0.359375, "grad_norm_var": 0.0018984476725260417, "learning_rate": 0.0001, "loss": 1.5612, "loss/crossentropy": 2.4597238302230835, "loss/fcd": 1.3671875, "loss/idx": 8.0, "loss/logits": 0.1939757913351059, "step": 2543 }, { "epoch": 0.0379874420445128, "grad_norm": 0.423828125, "grad_norm_var": 0.0018930435180664062, "learning_rate": 0.0001, "loss": 1.5681, "loss/crossentropy": 2.7385295629501343, "loss/fcd": 1.37890625, "loss/idx": 8.0, "loss/logits": 0.18922501057386398, "step": 2544 }, { "epoch": 0.03800237421512778, "grad_norm": 0.41796875, "grad_norm_var": 0.0019007205963134765, "learning_rate": 0.0001, "loss": 1.6354, "loss/crossentropy": 2.457058072090149, "loss/fcd": 1.4375, "loss/idx": 8.0, "loss/logits": 0.19791793823242188, "step": 2545 }, { "epoch": 0.038017306385742765, "grad_norm": 0.37890625, "grad_norm_var": 0.0018389225006103516, "learning_rate": 0.0001, "loss": 1.4602, "loss/crossentropy": 2.5022999048233032, "loss/fcd": 1.28515625, "loss/idx": 8.0, "loss/logits": 0.17506501823663712, "step": 2546 }, { "epoch": 0.03803223855635775, "grad_norm": 0.423828125, "grad_norm_var": 0.00181732177734375, "learning_rate": 0.0001, "loss": 1.6609, "loss/crossentropy": 2.4486243724823, "loss/fcd": 1.453125, "loss/idx": 8.0, "loss/logits": 0.2078242227435112, "step": 2547 }, { "epoch": 0.03804717072697272, "grad_norm": 0.4375, "grad_norm_var": 0.0016346613566080729, "learning_rate": 0.0001, "loss": 1.5625, "loss/crossentropy": 2.5284982919692993, "loss/fcd": 1.375, "loss/idx": 8.0, "loss/logits": 0.18753904104232788, "step": 2548 }, { "epoch": 0.038062102897587706, "grad_norm": 0.478515625, "grad_norm_var": 0.0018011569976806641, "learning_rate": 0.0001, "loss": 1.6701, "loss/crossentropy": 2.649232268333435, "loss/fcd": 1.45703125, "loss/idx": 8.0, "loss/logits": 0.21307353675365448, "step": 2549 }, { "epoch": 0.03807703506820269, "grad_norm": 0.365234375, "grad_norm_var": 0.0010996500651041666, "learning_rate": 0.0001, "loss": 1.4395, "loss/crossentropy": 2.285835385322571, "loss/fcd": 1.2734375, "loss/idx": 8.0, "loss/logits": 0.16609592735767365, "step": 2550 }, { "epoch": 0.03809196723881767, "grad_norm": 0.4453125, "grad_norm_var": 0.0011344750722249348, "learning_rate": 0.0001, "loss": 1.588, "loss/crossentropy": 2.602128267288208, "loss/fcd": 1.40234375, "loss/idx": 8.0, "loss/logits": 0.18562406301498413, "step": 2551 }, { "epoch": 0.038106899409432654, "grad_norm": 0.435546875, "grad_norm_var": 0.0010772546132405598, "learning_rate": 0.0001, "loss": 1.604, "loss/crossentropy": 2.5227988958358765, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.20554125308990479, "step": 2552 }, { "epoch": 0.03812183158004764, "grad_norm": 0.57421875, "grad_norm_var": 0.0025147596995035806, "learning_rate": 0.0001, "loss": 1.8405, "loss/crossentropy": 2.605988621711731, "loss/fcd": 1.609375, "loss/idx": 8.0, "loss/logits": 0.23112721741199493, "step": 2553 }, { "epoch": 0.03813676375066261, "grad_norm": 0.51953125, "grad_norm_var": 0.0029411156972249347, "learning_rate": 0.0001, "loss": 1.8287, "loss/crossentropy": 2.543513059616089, "loss/fcd": 1.5859375, "loss/idx": 8.0, "loss/logits": 0.2427598536014557, "step": 2554 }, { "epoch": 0.038151695921277595, "grad_norm": 0.384765625, "grad_norm_var": 0.0030865828196207684, "learning_rate": 0.0001, "loss": 1.6437, "loss/crossentropy": 2.2160935401916504, "loss/fcd": 1.4453125, "loss/idx": 8.0, "loss/logits": 0.19837912917137146, "step": 2555 }, { "epoch": 0.03816662809189258, "grad_norm": 0.546875, "grad_norm_var": 0.0038299401601155597, "learning_rate": 0.0001, "loss": 1.6286, "loss/crossentropy": 2.7437673807144165, "loss/fcd": 1.42578125, "loss/idx": 8.0, "loss/logits": 0.20277869701385498, "step": 2556 }, { "epoch": 0.03818156026250756, "grad_norm": 0.373046875, "grad_norm_var": 0.0041076024373372395, "learning_rate": 0.0001, "loss": 1.4209, "loss/crossentropy": 2.6492961645126343, "loss/fcd": 1.2578125, "loss/idx": 8.0, "loss/logits": 0.16308727115392685, "step": 2557 }, { "epoch": 0.03819649243312254, "grad_norm": 0.52734375, "grad_norm_var": 0.0046083927154541016, "learning_rate": 0.0001, "loss": 1.6382, "loss/crossentropy": 2.6905325651168823, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.2162850946187973, "step": 2558 }, { "epoch": 0.03821142460373752, "grad_norm": 0.4296875, "grad_norm_var": 0.004131174087524414, "learning_rate": 0.0001, "loss": 1.5098, "loss/crossentropy": 2.576603889465332, "loss/fcd": 1.33203125, "loss/idx": 8.0, "loss/logits": 0.17774121463298798, "step": 2559 }, { "epoch": 0.0382263567743525, "grad_norm": 0.447265625, "grad_norm_var": 0.004091119766235352, "learning_rate": 0.0001, "loss": 1.5693, "loss/crossentropy": 2.2701427936553955, "loss/fcd": 1.40234375, "loss/idx": 8.0, "loss/logits": 0.16697486490011215, "step": 2560 }, { "epoch": 0.038241288944967484, "grad_norm": 0.421875, "grad_norm_var": 0.004075860977172852, "learning_rate": 0.0001, "loss": 1.4414, "loss/crossentropy": 2.502422332763672, "loss/fcd": 1.2890625, "loss/idx": 8.0, "loss/logits": 0.1523171216249466, "step": 2561 }, { "epoch": 0.03825622111558247, "grad_norm": 0.48828125, "grad_norm_var": 0.00379637082417806, "learning_rate": 0.0001, "loss": 1.6005, "loss/crossentropy": 2.589791178703308, "loss/fcd": 1.40625, "loss/idx": 8.0, "loss/logits": 0.19426555931568146, "step": 2562 }, { "epoch": 0.03827115328619745, "grad_norm": 0.42578125, "grad_norm_var": 0.003788185119628906, "learning_rate": 0.0001, "loss": 1.6878, "loss/crossentropy": 2.654357433319092, "loss/fcd": 1.48046875, "loss/idx": 8.0, "loss/logits": 0.20731588453054428, "step": 2563 }, { "epoch": 0.03828608545681243, "grad_norm": 0.60546875, "grad_norm_var": 0.0051305135091145836, "learning_rate": 0.0001, "loss": 1.7633, "loss/crossentropy": 2.2747687101364136, "loss/fcd": 1.53515625, "loss/idx": 8.0, "loss/logits": 0.2281356379389763, "step": 2564 }, { "epoch": 0.03830101762742741, "grad_norm": 0.478515625, "grad_norm_var": 0.0051305135091145836, "learning_rate": 0.0001, "loss": 1.5427, "loss/crossentropy": 2.715540051460266, "loss/fcd": 1.359375, "loss/idx": 8.0, "loss/logits": 0.1833612397313118, "step": 2565 }, { "epoch": 0.03831594979804239, "grad_norm": 0.423828125, "grad_norm_var": 0.00455163319905599, "learning_rate": 0.0001, "loss": 1.4925, "loss/crossentropy": 2.6926647424697876, "loss/fcd": 1.31640625, "loss/idx": 8.0, "loss/logits": 0.17612801492214203, "step": 2566 }, { "epoch": 0.038330881968657374, "grad_norm": 0.447265625, "grad_norm_var": 0.004545323053995768, "learning_rate": 0.0001, "loss": 1.6071, "loss/crossentropy": 2.5738861560821533, "loss/fcd": 1.421875, "loss/idx": 8.0, "loss/logits": 0.1852003112435341, "step": 2567 }, { "epoch": 0.038345814139272356, "grad_norm": 0.3671875, "grad_norm_var": 0.005156707763671875, "learning_rate": 0.0001, "loss": 1.6018, "loss/crossentropy": 2.3879783153533936, "loss/fcd": 1.40625, "loss/idx": 8.0, "loss/logits": 0.19556927680969238, "step": 2568 }, { "epoch": 0.03836074630988734, "grad_norm": 0.390625, "grad_norm_var": 0.004621823628743489, "learning_rate": 0.0001, "loss": 1.6018, "loss/crossentropy": 2.526431083679199, "loss/fcd": 1.41796875, "loss/idx": 8.0, "loss/logits": 0.1838557869195938, "step": 2569 }, { "epoch": 0.038375678480502315, "grad_norm": 0.41015625, "grad_norm_var": 0.004426002502441406, "learning_rate": 0.0001, "loss": 1.5505, "loss/crossentropy": 2.646073818206787, "loss/fcd": 1.36328125, "loss/idx": 8.0, "loss/logits": 0.18723712861537933, "step": 2570 }, { "epoch": 0.0383906106511173, "grad_norm": 0.40234375, "grad_norm_var": 0.004297113418579102, "learning_rate": 0.0001, "loss": 1.5894, "loss/crossentropy": 2.514255404472351, "loss/fcd": 1.38671875, "loss/idx": 8.0, "loss/logits": 0.20268169790506363, "step": 2571 }, { "epoch": 0.03840554282173228, "grad_norm": 0.455078125, "grad_norm_var": 0.00362701416015625, "learning_rate": 0.0001, "loss": 1.7396, "loss/crossentropy": 2.7296959161758423, "loss/fcd": 1.49609375, "loss/idx": 8.0, "loss/logits": 0.24349959194660187, "step": 2572 }, { "epoch": 0.03842047499234726, "grad_norm": 0.416015625, "grad_norm_var": 0.0033395767211914064, "learning_rate": 0.0001, "loss": 1.5571, "loss/crossentropy": 2.598563075065613, "loss/fcd": 1.37890625, "loss/idx": 8.0, "loss/logits": 0.17820407450199127, "step": 2573 }, { "epoch": 0.038435407162962246, "grad_norm": 0.423828125, "grad_norm_var": 0.0028872013092041014, "learning_rate": 0.0001, "loss": 1.6201, "loss/crossentropy": 2.644793391227722, "loss/fcd": 1.4140625, "loss/idx": 8.0, "loss/logits": 0.20604727417230606, "step": 2574 }, { "epoch": 0.03845033933357723, "grad_norm": 0.40234375, "grad_norm_var": 0.002969980239868164, "learning_rate": 0.0001, "loss": 1.5528, "loss/crossentropy": 2.559776544570923, "loss/fcd": 1.37109375, "loss/idx": 8.0, "loss/logits": 0.18165642768144608, "step": 2575 }, { "epoch": 0.038465271504192204, "grad_norm": 0.3984375, "grad_norm_var": 0.0030577977498372397, "learning_rate": 0.0001, "loss": 1.6649, "loss/crossentropy": 2.3485809564590454, "loss/fcd": 1.4609375, "loss/idx": 8.0, "loss/logits": 0.20395320653915405, "step": 2576 }, { "epoch": 0.03848020367480719, "grad_norm": 0.421875, "grad_norm_var": 0.0030577977498372397, "learning_rate": 0.0001, "loss": 1.4799, "loss/crossentropy": 2.5326364040374756, "loss/fcd": 1.30078125, "loss/idx": 8.0, "loss/logits": 0.17912371456623077, "step": 2577 }, { "epoch": 0.03849513584542217, "grad_norm": 0.5703125, "grad_norm_var": 0.004063161214192709, "learning_rate": 0.0001, "loss": 1.6337, "loss/crossentropy": 2.4538668394088745, "loss/fcd": 1.4140625, "loss/idx": 8.0, "loss/logits": 0.2196699157357216, "step": 2578 }, { "epoch": 0.03851006801603715, "grad_norm": 0.4453125, "grad_norm_var": 0.004050127665201823, "learning_rate": 0.0001, "loss": 1.6756, "loss/crossentropy": 2.457972526550293, "loss/fcd": 1.44921875, "loss/idx": 8.0, "loss/logits": 0.22642634809017181, "step": 2579 }, { "epoch": 0.038525000186652135, "grad_norm": 0.447265625, "grad_norm_var": 0.0021485487620035806, "learning_rate": 0.0001, "loss": 1.5081, "loss/crossentropy": 2.6162463426589966, "loss/fcd": 1.33203125, "loss/idx": 8.0, "loss/logits": 0.17608024924993515, "step": 2580 }, { "epoch": 0.03853993235726711, "grad_norm": 0.453125, "grad_norm_var": 0.0020289103190104167, "learning_rate": 0.0001, "loss": 1.4234, "loss/crossentropy": 2.4926345348358154, "loss/fcd": 1.265625, "loss/idx": 8.0, "loss/logits": 0.15779562294483185, "step": 2581 }, { "epoch": 0.03855486452788209, "grad_norm": 0.44921875, "grad_norm_var": 0.0020493666330973306, "learning_rate": 0.0001, "loss": 1.5375, "loss/crossentropy": 2.6681625843048096, "loss/fcd": 1.359375, "loss/idx": 8.0, "loss/logits": 0.17808111011981964, "step": 2582 }, { "epoch": 0.038569796698497076, "grad_norm": 0.419921875, "grad_norm_var": 0.002037795384724935, "learning_rate": 0.0001, "loss": 1.5265, "loss/crossentropy": 2.849905848503113, "loss/fcd": 1.3515625, "loss/idx": 8.0, "loss/logits": 0.1749521717429161, "step": 2583 }, { "epoch": 0.03858472886911206, "grad_norm": 0.412109375, "grad_norm_var": 0.001790301005045573, "learning_rate": 0.0001, "loss": 1.5546, "loss/crossentropy": 2.668967366218567, "loss/fcd": 1.36328125, "loss/idx": 8.0, "loss/logits": 0.1913658007979393, "step": 2584 }, { "epoch": 0.03859966103972704, "grad_norm": 0.42578125, "grad_norm_var": 0.0016718546549479166, "learning_rate": 0.0001, "loss": 1.4316, "loss/crossentropy": 2.5603634119033813, "loss/fcd": 1.2734375, "loss/idx": 8.0, "loss/logits": 0.15820304304361343, "step": 2585 }, { "epoch": 0.038614593210342024, "grad_norm": 0.4296875, "grad_norm_var": 0.0016321182250976563, "learning_rate": 0.0001, "loss": 1.5615, "loss/crossentropy": 2.6238280534744263, "loss/fcd": 1.37890625, "loss/idx": 8.0, "loss/logits": 0.18257231265306473, "step": 2586 }, { "epoch": 0.038629525380957, "grad_norm": 0.439453125, "grad_norm_var": 0.0015526930491129556, "learning_rate": 0.0001, "loss": 1.5045, "loss/crossentropy": 2.5588778257369995, "loss/fcd": 1.33984375, "loss/idx": 8.0, "loss/logits": 0.1646304428577423, "step": 2587 }, { "epoch": 0.03864445755157198, "grad_norm": 0.45703125, "grad_norm_var": 0.0015573501586914062, "learning_rate": 0.0001, "loss": 1.5419, "loss/crossentropy": 2.3399779200553894, "loss/fcd": 1.375, "loss/idx": 8.0, "loss/logits": 0.16687344759702682, "step": 2588 }, { "epoch": 0.038659389722186965, "grad_norm": 0.421875, "grad_norm_var": 0.0015421390533447265, "learning_rate": 0.0001, "loss": 1.4747, "loss/crossentropy": 2.705562472343445, "loss/fcd": 1.30078125, "loss/idx": 8.0, "loss/logits": 0.1738956943154335, "step": 2589 }, { "epoch": 0.03867432189280195, "grad_norm": 0.4296875, "grad_norm_var": 0.001532745361328125, "learning_rate": 0.0001, "loss": 1.8828, "loss/crossentropy": 2.1438353061676025, "loss/fcd": 1.640625, "loss/idx": 8.0, "loss/logits": 0.24215079843997955, "step": 2590 }, { "epoch": 0.03868925406341693, "grad_norm": 0.380859375, "grad_norm_var": 0.0016664981842041016, "learning_rate": 0.0001, "loss": 1.3912, "loss/crossentropy": 2.7279763221740723, "loss/fcd": 1.23828125, "loss/idx": 8.0, "loss/logits": 0.15296506881713867, "step": 2591 }, { "epoch": 0.03870418623403191, "grad_norm": 0.4296875, "grad_norm_var": 0.0015642642974853516, "learning_rate": 0.0001, "loss": 1.7058, "loss/crossentropy": 2.657339334487915, "loss/fcd": 1.4765625, "loss/idx": 8.0, "loss/logits": 0.22920557856559753, "step": 2592 }, { "epoch": 0.03871911840464689, "grad_norm": 0.37890625, "grad_norm_var": 0.0017810662587483723, "learning_rate": 0.0001, "loss": 1.4995, "loss/crossentropy": 2.567889094352722, "loss/fcd": 1.3203125, "loss/idx": 8.0, "loss/logits": 0.17915304750204086, "step": 2593 }, { "epoch": 0.03873405057526187, "grad_norm": 0.443359375, "grad_norm_var": 0.0005299250284830729, "learning_rate": 0.0001, "loss": 1.6866, "loss/crossentropy": 2.3205610513687134, "loss/fcd": 1.48828125, "loss/idx": 8.0, "loss/logits": 0.19831962883472443, "step": 2594 }, { "epoch": 0.038748982745876855, "grad_norm": 0.3671875, "grad_norm_var": 0.0007410049438476562, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.277058482170105, "loss/fcd": 1.2734375, "loss/idx": 8.0, "loss/logits": 0.15301886200904846, "step": 2595 }, { "epoch": 0.03876391491649184, "grad_norm": 0.384765625, "grad_norm_var": 0.0007918675740559896, "learning_rate": 0.0001, "loss": 1.3504, "loss/crossentropy": 2.496413230895996, "loss/fcd": 1.203125, "loss/idx": 8.0, "loss/logits": 0.14727401733398438, "step": 2596 }, { "epoch": 0.03877884708710682, "grad_norm": 0.4921875, "grad_norm_var": 0.0010588963826497395, "learning_rate": 0.0001, "loss": 1.7257, "loss/crossentropy": 2.5532642602920532, "loss/fcd": 1.49609375, "loss/idx": 8.0, "loss/logits": 0.2296057641506195, "step": 2597 }, { "epoch": 0.038793779257721796, "grad_norm": 0.42578125, "grad_norm_var": 0.0010100682576497397, "learning_rate": 0.0001, "loss": 1.6023, "loss/crossentropy": 2.6459460258483887, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.20383698493242264, "step": 2598 }, { "epoch": 0.03880871142833678, "grad_norm": 0.392578125, "grad_norm_var": 0.001061248779296875, "learning_rate": 0.0001, "loss": 1.5898, "loss/crossentropy": 2.6895079612731934, "loss/fcd": 1.3984375, "loss/idx": 8.0, "loss/logits": 0.19134879857301712, "step": 2599 }, { "epoch": 0.03882364359895176, "grad_norm": 0.47265625, "grad_norm_var": 0.001231241226196289, "learning_rate": 0.0001, "loss": 1.5098, "loss/crossentropy": 2.6998510360717773, "loss/fcd": 1.32421875, "loss/idx": 8.0, "loss/logits": 0.1855810210108757, "step": 2600 }, { "epoch": 0.038838575769566744, "grad_norm": 0.416015625, "grad_norm_var": 0.0012338638305664062, "learning_rate": 0.0001, "loss": 1.5523, "loss/crossentropy": 2.5648155212402344, "loss/fcd": 1.36328125, "loss/idx": 8.0, "loss/logits": 0.18897220492362976, "step": 2601 }, { "epoch": 0.03885350794018173, "grad_norm": 0.41015625, "grad_norm_var": 0.0012392679850260417, "learning_rate": 0.0001, "loss": 1.4885, "loss/crossentropy": 2.5355254411697388, "loss/fcd": 1.3046875, "loss/idx": 8.0, "loss/logits": 0.18377254903316498, "step": 2602 }, { "epoch": 0.03886844011079671, "grad_norm": 0.2490234375, "grad_norm_var": 0.003047017256418864, "learning_rate": 0.0001, "loss": 1.4672, "loss/crossentropy": 2.575156331062317, "loss/fcd": 1.28125, "loss/idx": 8.5, "loss/logits": 0.18592246621847153, "step": 2603 }, { "epoch": 0.038883372281411685, "grad_norm": 0.298828125, "grad_norm_var": 0.0036083499590555826, "learning_rate": 0.0001, "loss": 1.4481, "loss/crossentropy": 2.4663561582565308, "loss/fcd": 1.25, "loss/idx": 8.5, "loss/logits": 0.19807378202676773, "step": 2604 }, { "epoch": 0.03889830445202667, "grad_norm": 0.283203125, "grad_norm_var": 0.004398310184478759, "learning_rate": 0.0001, "loss": 1.5871, "loss/crossentropy": 2.7749844789505005, "loss/fcd": 1.37109375, "loss/idx": 8.5, "loss/logits": 0.2159847915172577, "step": 2605 }, { "epoch": 0.03891323662264165, "grad_norm": 0.251953125, "grad_norm_var": 0.0054541865984598795, "learning_rate": 0.0001, "loss": 1.5561, "loss/crossentropy": 2.3115190267562866, "loss/fcd": 1.33984375, "loss/idx": 8.5, "loss/logits": 0.21621468663215637, "step": 2606 }, { "epoch": 0.03892816879325663, "grad_norm": 0.26953125, "grad_norm_var": 0.0062134067217508955, "learning_rate": 0.0001, "loss": 1.3842, "loss/crossentropy": 2.5664433240890503, "loss/fcd": 1.1953125, "loss/idx": 8.5, "loss/logits": 0.18892759084701538, "step": 2607 }, { "epoch": 0.038943100963871616, "grad_norm": 0.302734375, "grad_norm_var": 0.006258865197499593, "learning_rate": 0.0001, "loss": 1.5463, "loss/crossentropy": 2.3785523176193237, "loss/fcd": 1.3515625, "loss/idx": 8.5, "loss/logits": 0.19470161199569702, "step": 2608 }, { "epoch": 0.03895803313448659, "grad_norm": 0.375, "grad_norm_var": 0.0062525391578674315, "learning_rate": 0.0001, "loss": 1.5207, "loss/crossentropy": 2.48114550113678, "loss/fcd": 1.328125, "loss/idx": 8.5, "loss/logits": 0.1925334334373474, "step": 2609 }, { "epoch": 0.038972965305101574, "grad_norm": 0.2890625, "grad_norm_var": 0.006121949354807536, "learning_rate": 0.0001, "loss": 1.3601, "loss/crossentropy": 2.666449785232544, "loss/fcd": 1.1953125, "loss/idx": 8.5, "loss/logits": 0.16479192674160004, "step": 2610 }, { "epoch": 0.03898789747571656, "grad_norm": 0.3046875, "grad_norm_var": 0.006264873345692952, "learning_rate": 0.0001, "loss": 1.559, "loss/crossentropy": 2.6002973318099976, "loss/fcd": 1.3515625, "loss/idx": 8.5, "loss/logits": 0.20740759372711182, "step": 2611 }, { "epoch": 0.03900282964633154, "grad_norm": 0.275390625, "grad_norm_var": 0.006522111097971598, "learning_rate": 0.0001, "loss": 1.4814, "loss/crossentropy": 2.836969017982483, "loss/fcd": 1.26953125, "loss/idx": 8.5, "loss/logits": 0.21182217448949814, "step": 2612 }, { "epoch": 0.03901776181694652, "grad_norm": 0.244140625, "grad_norm_var": 0.005476470788319906, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.276855945587158, "loss/fcd": 1.2421875, "loss/idx": 8.5, "loss/logits": 0.18836060166358948, "step": 2613 }, { "epoch": 0.039032693987561505, "grad_norm": 0.2890625, "grad_norm_var": 0.004876768589019776, "learning_rate": 0.0001, "loss": 1.3708, "loss/crossentropy": 2.3823397159576416, "loss/fcd": 1.20703125, "loss/idx": 8.5, "loss/logits": 0.16373077780008316, "step": 2614 }, { "epoch": 0.03904762615817648, "grad_norm": 0.302734375, "grad_norm_var": 0.004514849185943604, "learning_rate": 0.0001, "loss": 1.4856, "loss/crossentropy": 2.5655856132507324, "loss/fcd": 1.30078125, "loss/idx": 8.5, "loss/logits": 0.18478816002607346, "step": 2615 }, { "epoch": 0.039062558328791464, "grad_norm": 0.2490234375, "grad_norm_var": 0.0029287815093994142, "learning_rate": 0.0001, "loss": 1.3881, "loss/crossentropy": 2.714416980743408, "loss/fcd": 1.2109375, "loss/idx": 8.5, "loss/logits": 0.17717060446739197, "step": 2616 }, { "epoch": 0.039077490499406446, "grad_norm": 0.263671875, "grad_norm_var": 0.0020361423492431642, "learning_rate": 0.0001, "loss": 1.3905, "loss/crossentropy": 2.61183500289917, "loss/fcd": 1.20703125, "loss/idx": 8.5, "loss/logits": 0.1834862157702446, "step": 2617 }, { "epoch": 0.03909242267002143, "grad_norm": 0.291015625, "grad_norm_var": 0.0010326385498046875, "learning_rate": 0.0001, "loss": 1.4496, "loss/crossentropy": 2.533151149749756, "loss/fcd": 1.27734375, "loss/idx": 8.5, "loss/logits": 0.17225409299135208, "step": 2618 }, { "epoch": 0.03910735484063641, "grad_norm": 0.267578125, "grad_norm_var": 0.0009683887163798014, "learning_rate": 0.0001, "loss": 1.3091, "loss/crossentropy": 2.6718605756759644, "loss/fcd": 1.1484375, "loss/idx": 8.5, "loss/logits": 0.16063150763511658, "step": 2619 }, { "epoch": 0.03912228701125139, "grad_norm": 0.263671875, "grad_norm_var": 0.0009801189104715984, "learning_rate": 0.0001, "loss": 1.4474, "loss/crossentropy": 2.667183041572571, "loss/fcd": 1.234375, "loss/idx": 8.5, "loss/logits": 0.21304906904697418, "step": 2620 }, { "epoch": 0.03913721918186637, "grad_norm": 0.26171875, "grad_norm_var": 0.001007393995920817, "learning_rate": 0.0001, "loss": 1.4049, "loss/crossentropy": 2.6071012020111084, "loss/fcd": 1.23046875, "loss/idx": 8.5, "loss/logits": 0.17445272952318192, "step": 2621 }, { "epoch": 0.03915215135248135, "grad_norm": 0.392578125, "grad_norm_var": 0.0016928950945536295, "learning_rate": 0.0001, "loss": 1.7538, "loss/crossentropy": 2.5166945457458496, "loss/fcd": 1.5, "loss/idx": 8.5, "loss/logits": 0.2537681460380554, "step": 2622 }, { "epoch": 0.039167083523096335, "grad_norm": 0.349609375, "grad_norm_var": 0.0018740614255269368, "learning_rate": 0.0001, "loss": 1.6235, "loss/crossentropy": 2.813810110092163, "loss/fcd": 1.421875, "loss/idx": 8.5, "loss/logits": 0.20159534364938736, "step": 2623 }, { "epoch": 0.03918201569371132, "grad_norm": 0.26953125, "grad_norm_var": 0.001909188429514567, "learning_rate": 0.0001, "loss": 1.3955, "loss/crossentropy": 2.729296922683716, "loss/fcd": 1.21875, "loss/idx": 8.5, "loss/logits": 0.17679324001073837, "step": 2624 }, { "epoch": 0.0391969478643263, "grad_norm": 0.283203125, "grad_norm_var": 0.0014325737953186034, "learning_rate": 0.0001, "loss": 1.5158, "loss/crossentropy": 2.5037161111831665, "loss/fcd": 1.3125, "loss/idx": 8.5, "loss/logits": 0.2033371925354004, "step": 2625 }, { "epoch": 0.03921188003494128, "grad_norm": 0.3046875, "grad_norm_var": 0.0014515201250712077, "learning_rate": 0.0001, "loss": 1.3974, "loss/crossentropy": 2.642889976501465, "loss/fcd": 1.21484375, "loss/idx": 8.5, "loss/logits": 0.18251638859510422, "step": 2626 }, { "epoch": 0.03922681220555626, "grad_norm": 0.265625, "grad_norm_var": 0.0014613747596740722, "learning_rate": 0.0001, "loss": 1.515, "loss/crossentropy": 2.527453303337097, "loss/fcd": 1.3203125, "loss/idx": 8.5, "loss/logits": 0.1947341486811638, "step": 2627 }, { "epoch": 0.03924174437617124, "grad_norm": 0.318359375, "grad_norm_var": 0.0015169739723205566, "learning_rate": 0.0001, "loss": 1.5261, "loss/crossentropy": 2.61286199092865, "loss/fcd": 1.3125, "loss/idx": 8.5, "loss/logits": 0.21358194947242737, "step": 2628 }, { "epoch": 0.039256676546786225, "grad_norm": 0.267578125, "grad_norm_var": 0.001412642002105713, "learning_rate": 0.0001, "loss": 1.5433, "loss/crossentropy": 2.5937705039978027, "loss/fcd": 1.33984375, "loss/idx": 8.5, "loss/logits": 0.20346547663211823, "step": 2629 }, { "epoch": 0.03927160871740121, "grad_norm": 0.255859375, "grad_norm_var": 0.001485598087310791, "learning_rate": 0.0001, "loss": 1.468, "loss/crossentropy": 2.5798020362854004, "loss/fcd": 1.27734375, "loss/idx": 8.5, "loss/logits": 0.1906125396490097, "step": 2630 }, { "epoch": 0.03928654088801618, "grad_norm": 0.28515625, "grad_norm_var": 0.0014701485633850098, "learning_rate": 0.0001, "loss": 1.4944, "loss/crossentropy": 2.7707003355026245, "loss/fcd": 1.29296875, "loss/idx": 8.5, "loss/logits": 0.2014707401394844, "step": 2631 }, { "epoch": 0.039301473058631166, "grad_norm": 0.27734375, "grad_norm_var": 0.0013776143391927083, "learning_rate": 0.0001, "loss": 1.6195, "loss/crossentropy": 2.6118104457855225, "loss/fcd": 1.390625, "loss/idx": 8.5, "loss/logits": 0.22886217385530472, "step": 2632 }, { "epoch": 0.03931640522924615, "grad_norm": 0.27734375, "grad_norm_var": 0.0013439019521077475, "learning_rate": 0.0001, "loss": 1.3173, "loss/crossentropy": 2.570255160331726, "loss/fcd": 1.16015625, "loss/idx": 8.5, "loss/logits": 0.15718621015548706, "step": 2633 }, { "epoch": 0.03933133739986113, "grad_norm": 0.296875, "grad_norm_var": 0.0013472874959309896, "learning_rate": 0.0001, "loss": 1.4994, "loss/crossentropy": 2.58853542804718, "loss/fcd": 1.296875, "loss/idx": 8.5, "loss/logits": 0.20257048308849335, "step": 2634 }, { "epoch": 0.039346269570476114, "grad_norm": 0.298828125, "grad_norm_var": 0.001315752665201823, "learning_rate": 0.0001, "loss": 1.3879, "loss/crossentropy": 2.321781635284424, "loss/fcd": 1.21484375, "loss/idx": 8.5, "loss/logits": 0.17303910851478577, "step": 2635 }, { "epoch": 0.0393612017410911, "grad_norm": 0.29296875, "grad_norm_var": 0.0012597242991129558, "learning_rate": 0.0001, "loss": 1.3684, "loss/crossentropy": 2.715195655822754, "loss/fcd": 1.19140625, "loss/idx": 8.5, "loss/logits": 0.17697061598300934, "step": 2636 }, { "epoch": 0.03937613391170607, "grad_norm": 0.296875, "grad_norm_var": 0.0011876265207926432, "learning_rate": 0.0001, "loss": 1.4737, "loss/crossentropy": 2.581777811050415, "loss/fcd": 1.2890625, "loss/idx": 8.5, "loss/logits": 0.18461082130670547, "step": 2637 }, { "epoch": 0.039391066082321055, "grad_norm": 0.330078125, "grad_norm_var": 0.0006250858306884766, "learning_rate": 0.0001, "loss": 1.3573, "loss/crossentropy": 2.6477479934692383, "loss/fcd": 1.1875, "loss/idx": 8.5, "loss/logits": 0.16979613155126572, "step": 2638 }, { "epoch": 0.03940599825293604, "grad_norm": 0.376953125, "grad_norm_var": 0.0008823235829671223, "learning_rate": 0.0001, "loss": 1.4625, "loss/crossentropy": 2.59823739528656, "loss/fcd": 1.26953125, "loss/idx": 8.5, "loss/logits": 0.19297759979963303, "step": 2639 }, { "epoch": 0.03942093042355102, "grad_norm": 0.279296875, "grad_norm_var": 0.0008569717407226563, "learning_rate": 0.0001, "loss": 1.5197, "loss/crossentropy": 2.7381982803344727, "loss/fcd": 1.30078125, "loss/idx": 8.5, "loss/logits": 0.21889273077249527, "step": 2640 }, { "epoch": 0.039435862594166, "grad_norm": 0.265625, "grad_norm_var": 0.0009020328521728516, "learning_rate": 0.0001, "loss": 1.4509, "loss/crossentropy": 2.4950894117355347, "loss/fcd": 1.265625, "loss/idx": 8.5, "loss/logits": 0.1853085309267044, "step": 2641 }, { "epoch": 0.03945079476478098, "grad_norm": 0.302734375, "grad_norm_var": 0.0008992513020833333, "learning_rate": 0.0001, "loss": 1.5085, "loss/crossentropy": 2.504630446434021, "loss/fcd": 1.3203125, "loss/idx": 8.5, "loss/logits": 0.1882021352648735, "step": 2642 }, { "epoch": 0.03946572693539596, "grad_norm": 0.306640625, "grad_norm_var": 0.0008548577626546224, "learning_rate": 0.0001, "loss": 1.397, "loss/crossentropy": 2.6627880334854126, "loss/fcd": 1.2265625, "loss/idx": 8.5, "loss/logits": 0.17044247686862946, "step": 2643 }, { "epoch": 0.039480659106010944, "grad_norm": 0.298828125, "grad_norm_var": 0.000819253921508789, "learning_rate": 0.0001, "loss": 1.3595, "loss/crossentropy": 2.54423725605011, "loss/fcd": 1.19921875, "loss/idx": 8.5, "loss/logits": 0.16033007949590683, "step": 2644 }, { "epoch": 0.03949559127662593, "grad_norm": 0.267578125, "grad_norm_var": 0.000819253921508789, "learning_rate": 0.0001, "loss": 1.4028, "loss/crossentropy": 2.297085762023926, "loss/fcd": 1.23828125, "loss/idx": 8.5, "loss/logits": 0.16451385617256165, "step": 2645 }, { "epoch": 0.03951052344724091, "grad_norm": 0.3203125, "grad_norm_var": 0.000748443603515625, "learning_rate": 0.0001, "loss": 1.5966, "loss/crossentropy": 2.518720030784607, "loss/fcd": 1.3828125, "loss/idx": 8.5, "loss/logits": 0.21382658183574677, "step": 2646 }, { "epoch": 0.03952545561785589, "grad_norm": 0.33984375, "grad_norm_var": 0.0008392333984375, "learning_rate": 0.0001, "loss": 1.4994, "loss/crossentropy": 2.5892590284347534, "loss/fcd": 1.3125, "loss/idx": 8.5, "loss/logits": 0.18688871711492538, "step": 2647 }, { "epoch": 0.03954038778847087, "grad_norm": 0.27734375, "grad_norm_var": 0.0008392333984375, "learning_rate": 0.0001, "loss": 1.4136, "loss/crossentropy": 2.669550657272339, "loss/fcd": 1.234375, "loss/idx": 8.5, "loss/logits": 0.1792566031217575, "step": 2648 }, { "epoch": 0.03955531995908585, "grad_norm": 0.326171875, "grad_norm_var": 0.0008292992909749349, "learning_rate": 0.0001, "loss": 1.6103, "loss/crossentropy": 2.589750289916992, "loss/fcd": 1.41015625, "loss/idx": 8.5, "loss/logits": 0.2001681923866272, "step": 2649 }, { "epoch": 0.039570252129700834, "grad_norm": 0.2890625, "grad_norm_var": 0.0008413791656494141, "learning_rate": 0.0001, "loss": 1.4714, "loss/crossentropy": 2.7801259756088257, "loss/fcd": 1.2890625, "loss/idx": 8.5, "loss/logits": 0.18233658373355865, "step": 2650 }, { "epoch": 0.039585184300315816, "grad_norm": 0.2333984375, "grad_norm_var": 0.0011568665504455566, "learning_rate": 0.0001, "loss": 1.3527, "loss/crossentropy": 2.3954578638076782, "loss/fcd": 1.19140625, "loss/idx": 8.5, "loss/logits": 0.16124705225229263, "step": 2651 }, { "epoch": 0.0396001164709308, "grad_norm": 0.283203125, "grad_norm_var": 0.0011722842852274578, "learning_rate": 0.0001, "loss": 1.515, "loss/crossentropy": 2.4745770692825317, "loss/fcd": 1.328125, "loss/idx": 8.5, "loss/logits": 0.18686656653881073, "step": 2652 }, { "epoch": 0.03961504864154578, "grad_norm": 0.458984375, "grad_norm_var": 0.0027553836504618325, "learning_rate": 0.0001, "loss": 1.6944, "loss/crossentropy": 2.293736696243286, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.23349064588546753, "step": 2653 }, { "epoch": 0.03962998081216076, "grad_norm": 0.796875, "grad_norm_var": 0.017639092604319253, "learning_rate": 0.0001, "loss": 1.7994, "loss/crossentropy": 2.751600742340088, "loss/fcd": 1.55078125, "loss/idx": 9.0, "loss/logits": 0.24861685931682587, "step": 2654 }, { "epoch": 0.03964491298277574, "grad_norm": 0.484375, "grad_norm_var": 0.01890493631362915, "learning_rate": 0.0001, "loss": 1.5601, "loss/crossentropy": 2.5182117223739624, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.1850782334804535, "step": 2655 }, { "epoch": 0.03965984515339072, "grad_norm": 0.494140625, "grad_norm_var": 0.0198892871538798, "learning_rate": 0.0001, "loss": 1.605, "loss/crossentropy": 2.5398313999176025, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.1987379491329193, "step": 2656 }, { "epoch": 0.039674777324005706, "grad_norm": 0.41796875, "grad_norm_var": 0.019441727797190347, "learning_rate": 0.0001, "loss": 1.5123, "loss/crossentropy": 2.6441643238067627, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.17632798850536346, "step": 2657 }, { "epoch": 0.03968970949462069, "grad_norm": 0.46484375, "grad_norm_var": 0.019660723209381104, "learning_rate": 0.0001, "loss": 1.657, "loss/crossentropy": 2.7174044847488403, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.21563098579645157, "step": 2658 }, { "epoch": 0.039704641665235664, "grad_norm": 0.451171875, "grad_norm_var": 0.019577213128407798, "learning_rate": 0.0001, "loss": 1.6138, "loss/crossentropy": 2.4868476390838623, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.1997820883989334, "step": 2659 }, { "epoch": 0.03971957383585065, "grad_norm": 0.41796875, "grad_norm_var": 0.01905170679092407, "learning_rate": 0.0001, "loss": 1.5554, "loss/crossentropy": 2.646156072616577, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.1921032890677452, "step": 2660 }, { "epoch": 0.03973450600646563, "grad_norm": 0.412109375, "grad_norm_var": 0.01789785623550415, "learning_rate": 0.0001, "loss": 1.5171, "loss/crossentropy": 2.5003496408462524, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.18111955374479294, "step": 2661 }, { "epoch": 0.03974943817708061, "grad_norm": 0.498046875, "grad_norm_var": 0.017883392175038655, "learning_rate": 0.0001, "loss": 1.7831, "loss/crossentropy": 2.4437875747680664, "loss/fcd": 1.546875, "loss/idx": 9.0, "loss/logits": 0.2362736165523529, "step": 2662 }, { "epoch": 0.039764370347695595, "grad_norm": 0.419921875, "grad_norm_var": 0.01747804880142212, "learning_rate": 0.0001, "loss": 1.5572, "loss/crossentropy": 2.6105082035064697, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.18605820834636688, "step": 2663 }, { "epoch": 0.03977930251831058, "grad_norm": 0.455078125, "grad_norm_var": 0.01606346368789673, "learning_rate": 0.0001, "loss": 1.585, "loss/crossentropy": 2.4350595474243164, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.19440454244613647, "step": 2664 }, { "epoch": 0.03979423468892555, "grad_norm": 0.458984375, "grad_norm_var": 0.015301477909088135, "learning_rate": 0.0001, "loss": 1.8132, "loss/crossentropy": 2.356851816177368, "loss/fcd": 1.5703125, "loss/idx": 9.0, "loss/logits": 0.24287424236536026, "step": 2665 }, { "epoch": 0.039809166859540536, "grad_norm": 0.416015625, "grad_norm_var": 0.01375795602798462, "learning_rate": 0.0001, "loss": 1.7138, "loss/crossentropy": 2.456999182701111, "loss/fcd": 1.484375, "loss/idx": 9.0, "loss/logits": 0.22943846881389618, "step": 2666 }, { "epoch": 0.03982409903015552, "grad_norm": 0.431640625, "grad_norm_var": 0.01054991086324056, "learning_rate": 0.0001, "loss": 1.5917, "loss/crossentropy": 2.705851674079895, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.20104067027568817, "step": 2667 }, { "epoch": 0.0398390312007705, "grad_norm": 0.408203125, "grad_norm_var": 0.008578475316365559, "learning_rate": 0.0001, "loss": 1.587, "loss/crossentropy": 2.4379669427871704, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.19246099889278412, "step": 2668 }, { "epoch": 0.039853963371385484, "grad_norm": 0.380859375, "grad_norm_var": 0.009052769343058268, "learning_rate": 0.0001, "loss": 1.569, "loss/crossentropy": 2.5265203714370728, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.19789350777864456, "step": 2669 }, { "epoch": 0.03986889554200046, "grad_norm": 0.435546875, "grad_norm_var": 0.0011281331380208334, "learning_rate": 0.0001, "loss": 1.5528, "loss/crossentropy": 2.674806237220764, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.18172632902860641, "step": 2670 }, { "epoch": 0.03988382771261544, "grad_norm": 0.431640625, "grad_norm_var": 0.0009929498036702475, "learning_rate": 0.0001, "loss": 1.7293, "loss/crossentropy": 2.955367684364319, "loss/fcd": 1.5, "loss/idx": 9.0, "loss/logits": 0.22927934676408768, "step": 2671 }, { "epoch": 0.039898759883230425, "grad_norm": 0.400390625, "grad_norm_var": 0.0008296807607014974, "learning_rate": 0.0001, "loss": 1.5597, "loss/crossentropy": 2.640317440032959, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20035617798566818, "step": 2672 }, { "epoch": 0.03991369205384541, "grad_norm": 0.451171875, "grad_norm_var": 0.0008396784464518229, "learning_rate": 0.0001, "loss": 1.7767, "loss/crossentropy": 2.602523922920227, "loss/fcd": 1.5234375, "loss/idx": 9.0, "loss/logits": 0.2532244473695755, "step": 2673 }, { "epoch": 0.03992862422446039, "grad_norm": 0.40234375, "grad_norm_var": 0.0008213678995768229, "learning_rate": 0.0001, "loss": 1.5787, "loss/crossentropy": 2.7410587072372437, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.21153900772333145, "step": 2674 }, { "epoch": 0.03994355639507537, "grad_norm": 0.462890625, "grad_norm_var": 0.0008639017740885417, "learning_rate": 0.0001, "loss": 1.6641, "loss/crossentropy": 2.8765653371810913, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.2148786336183548, "step": 2675 }, { "epoch": 0.03995848856569035, "grad_norm": 0.40234375, "grad_norm_var": 0.0009045918782552083, "learning_rate": 0.0001, "loss": 1.7014, "loss/crossentropy": 2.569037437438965, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.2131112813949585, "step": 2676 }, { "epoch": 0.03997342073630533, "grad_norm": 0.3984375, "grad_norm_var": 0.0009474277496337891, "learning_rate": 0.0001, "loss": 1.6771, "loss/crossentropy": 2.650820732116699, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.23571473360061646, "step": 2677 }, { "epoch": 0.039988352906920314, "grad_norm": 0.40234375, "grad_norm_var": 0.0006304423014322917, "learning_rate": 0.0001, "loss": 1.5108, "loss/crossentropy": 2.742453098297119, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.18657661974430084, "step": 2678 }, { "epoch": 0.0400032850775353, "grad_norm": 0.419921875, "grad_norm_var": 0.0006304423014322917, "learning_rate": 0.0001, "loss": 1.613, "loss/crossentropy": 2.92600679397583, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.18330446630716324, "step": 2679 }, { "epoch": 0.04001821724815028, "grad_norm": 0.44921875, "grad_norm_var": 0.0006070295969645182, "learning_rate": 0.0001, "loss": 1.758, "loss/crossentropy": 2.2612446546554565, "loss/fcd": 1.53125, "loss/idx": 9.0, "loss/logits": 0.22674322128295898, "step": 2680 }, { "epoch": 0.040033149418765256, "grad_norm": 0.40625, "grad_norm_var": 0.000520769755045573, "learning_rate": 0.0001, "loss": 1.5545, "loss/crossentropy": 2.4203641414642334, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.19121520221233368, "step": 2681 }, { "epoch": 0.04004808158938024, "grad_norm": 0.455078125, "grad_norm_var": 0.0006021499633789062, "learning_rate": 0.0001, "loss": 1.5861, "loss/crossentropy": 2.7455105781555176, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.19152191281318665, "step": 2682 }, { "epoch": 0.04006301375999522, "grad_norm": 0.380859375, "grad_norm_var": 0.0006922403971354167, "learning_rate": 0.0001, "loss": 1.559, "loss/crossentropy": 2.689688563346863, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.18793320655822754, "step": 2683 }, { "epoch": 0.040077945930610204, "grad_norm": 0.41015625, "grad_norm_var": 0.0006899356842041016, "learning_rate": 0.0001, "loss": 1.7541, "loss/crossentropy": 2.6422417163848877, "loss/fcd": 1.51171875, "loss/idx": 9.0, "loss/logits": 0.2424018830060959, "step": 2684 }, { "epoch": 0.040092878101225186, "grad_norm": 0.5078125, "grad_norm_var": 0.0010670344034830728, "learning_rate": 0.0001, "loss": 1.6187, "loss/crossentropy": 2.823509097099304, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.20459182560443878, "step": 2685 }, { "epoch": 0.04010781027184017, "grad_norm": 0.5234375, "grad_norm_var": 0.0016614119211832681, "learning_rate": 0.0001, "loss": 1.7007, "loss/crossentropy": 2.623517394065857, "loss/fcd": 1.4921875, "loss/idx": 9.0, "loss/logits": 0.20855662971735, "step": 2686 }, { "epoch": 0.040122742442455145, "grad_norm": 0.37109375, "grad_norm_var": 0.0018895467122395833, "learning_rate": 0.0001, "loss": 1.5295, "loss/crossentropy": 2.6044704914093018, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.1818469986319542, "step": 2687 }, { "epoch": 0.04013767461307013, "grad_norm": 0.490234375, "grad_norm_var": 0.002066485087076823, "learning_rate": 0.0001, "loss": 1.7245, "loss/crossentropy": 2.511703610420227, "loss/fcd": 1.49609375, "loss/idx": 9.0, "loss/logits": 0.228400319814682, "step": 2688 }, { "epoch": 0.04015260678368511, "grad_norm": 0.48046875, "grad_norm_var": 0.002189747492472331, "learning_rate": 0.0001, "loss": 1.6713, "loss/crossentropy": 3.0065401792526245, "loss/fcd": 1.4453125, "loss/idx": 9.0, "loss/logits": 0.22602692246437073, "step": 2689 }, { "epoch": 0.04016753895430009, "grad_norm": 0.4453125, "grad_norm_var": 0.002117013931274414, "learning_rate": 0.0001, "loss": 1.7298, "loss/crossentropy": 2.730826497077942, "loss/fcd": 1.5, "loss/idx": 9.0, "loss/logits": 0.22980520129203796, "step": 2690 }, { "epoch": 0.040182471124915076, "grad_norm": 0.8828125, "grad_norm_var": 0.014539019266764323, "learning_rate": 0.0001, "loss": 1.7562, "loss/crossentropy": 2.3691927194595337, "loss/fcd": 1.515625, "loss/idx": 9.0, "loss/logits": 0.24057485163211823, "step": 2691 }, { "epoch": 0.04019740329553005, "grad_norm": 0.419921875, "grad_norm_var": 0.014413563410441081, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.6185015439987183, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.1694553643465042, "step": 2692 }, { "epoch": 0.040212335466145034, "grad_norm": 0.51953125, "grad_norm_var": 0.014251947402954102, "learning_rate": 0.0001, "loss": 1.814, "loss/crossentropy": 2.635100841522217, "loss/fcd": 1.59375, "loss/idx": 9.0, "loss/logits": 0.2202240377664566, "step": 2693 }, { "epoch": 0.04022726763676002, "grad_norm": 0.416015625, "grad_norm_var": 0.014135233561197917, "learning_rate": 0.0001, "loss": 1.4401, "loss/crossentropy": 2.7201433181762695, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.1588825359940529, "step": 2694 }, { "epoch": 0.040242199807375, "grad_norm": 0.419921875, "grad_norm_var": 0.014135233561197917, "learning_rate": 0.0001, "loss": 1.4944, "loss/crossentropy": 2.5585933923721313, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.1819174960255623, "step": 2695 }, { "epoch": 0.04025713197798998, "grad_norm": 0.46875, "grad_norm_var": 0.014095497131347657, "learning_rate": 0.0001, "loss": 1.6203, "loss/crossentropy": 2.6387170553207397, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.2062118947505951, "step": 2696 }, { "epoch": 0.040272064148604965, "grad_norm": 0.4140625, "grad_norm_var": 0.014027849833170573, "learning_rate": 0.0001, "loss": 1.6591, "loss/crossentropy": 2.7397797107696533, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.22553183883428574, "step": 2697 }, { "epoch": 0.04028699631921994, "grad_norm": 0.404296875, "grad_norm_var": 0.014326222737630208, "learning_rate": 0.0001, "loss": 1.61, "loss/crossentropy": 2.8642138242721558, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.2194150611758232, "step": 2698 }, { "epoch": 0.04030192848983492, "grad_norm": 0.42578125, "grad_norm_var": 0.01390544573465983, "learning_rate": 0.0001, "loss": 1.4963, "loss/crossentropy": 2.5436079502105713, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.17991328984498978, "step": 2699 }, { "epoch": 0.040316860660449906, "grad_norm": 0.447265625, "grad_norm_var": 0.013670794169108073, "learning_rate": 0.0001, "loss": 1.5604, "loss/crossentropy": 2.463050127029419, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.1854267120361328, "step": 2700 }, { "epoch": 0.04033179283106489, "grad_norm": 0.447265625, "grad_norm_var": 0.01365354855855306, "learning_rate": 0.0001, "loss": 1.7408, "loss/crossentropy": 2.714255452156067, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.2524690628051758, "step": 2701 }, { "epoch": 0.04034672500167987, "grad_norm": 0.3515625, "grad_norm_var": 0.014355707168579101, "learning_rate": 0.0001, "loss": 1.4766, "loss/crossentropy": 2.514744758605957, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.16804223507642746, "step": 2702 }, { "epoch": 0.04036165717229485, "grad_norm": 0.404296875, "grad_norm_var": 0.014018758138020834, "learning_rate": 0.0001, "loss": 1.6186, "loss/crossentropy": 2.4954272508621216, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.21238742023706436, "step": 2703 }, { "epoch": 0.04037658934290983, "grad_norm": 0.423828125, "grad_norm_var": 0.014069557189941406, "learning_rate": 0.0001, "loss": 1.4894, "loss/crossentropy": 2.5836503505706787, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.1807905212044716, "step": 2704 }, { "epoch": 0.04039152151352481, "grad_norm": 0.400390625, "grad_norm_var": 0.01425919532775879, "learning_rate": 0.0001, "loss": 1.7071, "loss/crossentropy": 2.701619267463684, "loss/fcd": 1.47265625, "loss/idx": 9.0, "loss/logits": 0.23448502272367477, "step": 2705 }, { "epoch": 0.040406453684139795, "grad_norm": 0.52734375, "grad_norm_var": 0.014566278457641602, "learning_rate": 0.0001, "loss": 1.8647, "loss/crossentropy": 2.378059506416321, "loss/fcd": 1.5859375, "loss/idx": 9.0, "loss/logits": 0.27873383462429047, "step": 2706 }, { "epoch": 0.04042138585475478, "grad_norm": 0.39453125, "grad_norm_var": 0.001993672053019206, "learning_rate": 0.0001, "loss": 1.5502, "loss/crossentropy": 2.575024962425232, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.17910057306289673, "step": 2707 }, { "epoch": 0.04043631802536976, "grad_norm": 0.36328125, "grad_norm_var": 0.0022725423177083334, "learning_rate": 0.0001, "loss": 1.5693, "loss/crossentropy": 2.6238605976104736, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.2021450251340866, "step": 2708 }, { "epoch": 0.040451250195984736, "grad_norm": 0.470703125, "grad_norm_var": 0.0018175601959228515, "learning_rate": 0.0001, "loss": 1.6737, "loss/crossentropy": 2.6365580558776855, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.20887846499681473, "step": 2709 }, { "epoch": 0.04046618236659972, "grad_norm": 0.345703125, "grad_norm_var": 0.002198648452758789, "learning_rate": 0.0001, "loss": 1.4659, "loss/crossentropy": 2.5106853246688843, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18853846192359924, "step": 2710 }, { "epoch": 0.0404811145372147, "grad_norm": 0.416015625, "grad_norm_var": 0.0021992842356363933, "learning_rate": 0.0001, "loss": 1.5916, "loss/crossentropy": 2.816643238067627, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.20094604790210724, "step": 2711 }, { "epoch": 0.040496046707829685, "grad_norm": 0.369140625, "grad_norm_var": 0.002159563700358073, "learning_rate": 0.0001, "loss": 1.5474, "loss/crossentropy": 2.481678605079651, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.18806003779172897, "step": 2712 }, { "epoch": 0.04051097887844467, "grad_norm": 0.41015625, "grad_norm_var": 0.002159881591796875, "learning_rate": 0.0001, "loss": 1.6817, "loss/crossentropy": 2.5615795850753784, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.2597897946834564, "step": 2713 }, { "epoch": 0.04052591104905965, "grad_norm": 0.396484375, "grad_norm_var": 0.0021723429361979168, "learning_rate": 0.0001, "loss": 1.5621, "loss/crossentropy": 2.5300809144973755, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.2027224898338318, "step": 2714 }, { "epoch": 0.040540843219674626, "grad_norm": 0.375, "grad_norm_var": 0.0022409439086914064, "learning_rate": 0.0001, "loss": 1.5091, "loss/crossentropy": 2.5511640310287476, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.1809980645775795, "step": 2715 }, { "epoch": 0.04055577539028961, "grad_norm": 0.3984375, "grad_norm_var": 0.0021404107411702473, "learning_rate": 0.0001, "loss": 1.6544, "loss/crossentropy": 2.603204607963562, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.21694742888212204, "step": 2716 }, { "epoch": 0.04057070756090459, "grad_norm": 0.46875, "grad_norm_var": 0.0022878011067708334, "learning_rate": 0.0001, "loss": 1.7373, "loss/crossentropy": 2.5810707807540894, "loss/fcd": 1.515625, "loss/idx": 9.0, "loss/logits": 0.22165369987487793, "step": 2717 }, { "epoch": 0.040585639731519574, "grad_norm": 0.42578125, "grad_norm_var": 0.0020812352498372397, "learning_rate": 0.0001, "loss": 1.5255, "loss/crossentropy": 2.600430488586426, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.20123173296451569, "step": 2718 }, { "epoch": 0.040600571902134557, "grad_norm": 0.439453125, "grad_norm_var": 0.0021230061848958332, "learning_rate": 0.0001, "loss": 1.7753, "loss/crossentropy": 2.438339352607727, "loss/fcd": 1.515625, "loss/idx": 9.0, "loss/logits": 0.25966930389404297, "step": 2719 }, { "epoch": 0.04061550407274953, "grad_norm": 0.388671875, "grad_norm_var": 0.0021544774373372395, "learning_rate": 0.0001, "loss": 1.5006, "loss/crossentropy": 2.6241776943206787, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.1803080439567566, "step": 2720 }, { "epoch": 0.040630436243364515, "grad_norm": 0.396484375, "grad_norm_var": 0.002161407470703125, "learning_rate": 0.0001, "loss": 1.5135, "loss/crossentropy": 2.6298223733901978, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1970556527376175, "step": 2721 }, { "epoch": 0.0406453684139795, "grad_norm": 0.478515625, "grad_norm_var": 0.0015570163726806641, "learning_rate": 0.0001, "loss": 1.5805, "loss/crossentropy": 2.744978904724121, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.20943140983581543, "step": 2722 }, { "epoch": 0.04066030058459448, "grad_norm": 0.416015625, "grad_norm_var": 0.0015456517537434896, "learning_rate": 0.0001, "loss": 1.6676, "loss/crossentropy": 2.57023823261261, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.21837153285741806, "step": 2723 }, { "epoch": 0.04067523275520946, "grad_norm": 0.40625, "grad_norm_var": 0.001393890380859375, "learning_rate": 0.0001, "loss": 1.6685, "loss/crossentropy": 2.450881004333496, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.21535152196884155, "step": 2724 }, { "epoch": 0.040690164925824446, "grad_norm": 0.435546875, "grad_norm_var": 0.0011987686157226562, "learning_rate": 0.0001, "loss": 1.5253, "loss/crossentropy": 2.7840172052383423, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.18541684746742249, "step": 2725 }, { "epoch": 0.04070509709643942, "grad_norm": 0.392578125, "grad_norm_var": 0.0009317398071289062, "learning_rate": 0.0001, "loss": 1.645, "loss/crossentropy": 2.533411145210266, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.21142403781414032, "step": 2726 }, { "epoch": 0.040720029267054404, "grad_norm": 0.447265625, "grad_norm_var": 0.0010039647420247396, "learning_rate": 0.0001, "loss": 1.6427, "loss/crossentropy": 2.515592932701111, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.2052415907382965, "step": 2727 }, { "epoch": 0.04073496143766939, "grad_norm": 0.373046875, "grad_norm_var": 0.0009808858235677083, "learning_rate": 0.0001, "loss": 1.5722, "loss/crossentropy": 2.561529040336609, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.2166931927204132, "step": 2728 }, { "epoch": 0.04074989360828437, "grad_norm": 0.59375, "grad_norm_var": 0.0029560724894205728, "learning_rate": 0.0001, "loss": 1.862, "loss/crossentropy": 2.9094104766845703, "loss/fcd": 1.6015625, "loss/idx": 9.0, "loss/logits": 0.2604585587978363, "step": 2729 }, { "epoch": 0.04076482577889935, "grad_norm": 0.419921875, "grad_norm_var": 0.0028950373331705728, "learning_rate": 0.0001, "loss": 1.572, "loss/crossentropy": 2.4933136701583862, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.19697245955467224, "step": 2730 }, { "epoch": 0.04077975794951433, "grad_norm": 0.421875, "grad_norm_var": 0.002698198954264323, "learning_rate": 0.0001, "loss": 1.654, "loss/crossentropy": 2.5807889699935913, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.2165011167526245, "step": 2731 }, { "epoch": 0.04079469012012931, "grad_norm": 0.390625, "grad_norm_var": 0.002736345926920573, "learning_rate": 0.0001, "loss": 1.7059, "loss/crossentropy": 2.6635286808013916, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.24105119705200195, "step": 2732 }, { "epoch": 0.040809622290744293, "grad_norm": 0.466796875, "grad_norm_var": 0.00272672971089681, "learning_rate": 0.0001, "loss": 2.2837, "loss/crossentropy": 2.275197982788086, "loss/fcd": 1.9296875, "loss/idx": 9.0, "loss/logits": 0.3540118336677551, "step": 2733 }, { "epoch": 0.040824554461359276, "grad_norm": 0.361328125, "grad_norm_var": 0.0030293782552083333, "learning_rate": 0.0001, "loss": 1.4708, "loss/crossentropy": 2.5261255502700806, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.17782053351402283, "step": 2734 }, { "epoch": 0.04083948663197426, "grad_norm": 0.3984375, "grad_norm_var": 0.003065093358357747, "learning_rate": 0.0001, "loss": 1.6261, "loss/crossentropy": 2.7033716440200806, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.21597259491682053, "step": 2735 }, { "epoch": 0.04085441880258924, "grad_norm": 0.408203125, "grad_norm_var": 0.002996428807576497, "learning_rate": 0.0001, "loss": 1.7314, "loss/crossentropy": 2.671252727508545, "loss/fcd": 1.484375, "loss/idx": 9.0, "loss/logits": 0.24701882898807526, "step": 2736 }, { "epoch": 0.04086935097320422, "grad_norm": 0.45703125, "grad_norm_var": 0.0029919942220052085, "learning_rate": 0.0001, "loss": 1.6248, "loss/crossentropy": 2.8483322858810425, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.23027317225933075, "step": 2737 }, { "epoch": 0.0408842831438192, "grad_norm": 0.359375, "grad_norm_var": 0.0030957380930582683, "learning_rate": 0.0001, "loss": 1.5472, "loss/crossentropy": 2.6109222173690796, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.18787486106157303, "step": 2738 }, { "epoch": 0.04089921531443418, "grad_norm": 0.384765625, "grad_norm_var": 0.003180678685506185, "learning_rate": 0.0001, "loss": 1.4182, "loss/crossentropy": 2.6670740842819214, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.156435988843441, "step": 2739 }, { "epoch": 0.040914147485049165, "grad_norm": 0.40625, "grad_norm_var": 0.003180678685506185, "learning_rate": 0.0001, "loss": 1.4175, "loss/crossentropy": 2.482179880142212, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.15574558824300766, "step": 2740 }, { "epoch": 0.04092907965566415, "grad_norm": 0.396484375, "grad_norm_var": 0.0031940301259358725, "learning_rate": 0.0001, "loss": 1.5649, "loss/crossentropy": 2.51253080368042, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.1898946613073349, "step": 2741 }, { "epoch": 0.040944011826279124, "grad_norm": 0.34375, "grad_norm_var": 0.003504371643066406, "learning_rate": 0.0001, "loss": 1.5164, "loss/crossentropy": 2.613757848739624, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18437838554382324, "step": 2742 }, { "epoch": 0.04095894399689411, "grad_norm": 0.43359375, "grad_norm_var": 0.003455972671508789, "learning_rate": 0.0001, "loss": 1.5813, "loss/crossentropy": 2.653122067451477, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.2101701721549034, "step": 2743 }, { "epoch": 0.04097387616750909, "grad_norm": 0.39453125, "grad_norm_var": 0.0033690770467122394, "learning_rate": 0.0001, "loss": 1.5732, "loss/crossentropy": 2.4571194648742676, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.19424960017204285, "step": 2744 }, { "epoch": 0.04098880833812407, "grad_norm": 0.400390625, "grad_norm_var": 0.0010921319325764975, "learning_rate": 0.0001, "loss": 1.63, "loss/crossentropy": 2.671294689178467, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.2120709791779518, "step": 2745 }, { "epoch": 0.041003740508739055, "grad_norm": 0.404296875, "grad_norm_var": 0.0010715325673421223, "learning_rate": 0.0001, "loss": 1.48, "loss/crossentropy": 2.824586510658264, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.17529985308647156, "step": 2746 }, { "epoch": 0.04101867267935404, "grad_norm": 0.341796875, "grad_norm_var": 0.0012572606404622395, "learning_rate": 0.0001, "loss": 1.5434, "loss/crossentropy": 2.4863226413726807, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19180522114038467, "step": 2747 }, { "epoch": 0.04103360484996901, "grad_norm": 0.390625, "grad_norm_var": 0.0012572606404622395, "learning_rate": 0.0001, "loss": 1.5954, "loss/crossentropy": 2.5521801710128784, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.20474105328321457, "step": 2748 }, { "epoch": 0.041048537020583996, "grad_norm": 0.392578125, "grad_norm_var": 0.0009081522623697917, "learning_rate": 0.0001, "loss": 1.6839, "loss/crossentropy": 2.5532721281051636, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.20339557528495789, "step": 2749 }, { "epoch": 0.04106346919119898, "grad_norm": 0.447265625, "grad_norm_var": 0.0010172526041666667, "learning_rate": 0.0001, "loss": 1.7777, "loss/crossentropy": 2.348669171333313, "loss/fcd": 1.55859375, "loss/idx": 9.0, "loss/logits": 0.21912362426519394, "step": 2750 }, { "epoch": 0.04107840136181396, "grad_norm": 0.470703125, "grad_norm_var": 0.0013530572255452474, "learning_rate": 0.0001, "loss": 1.5912, "loss/crossentropy": 2.4546643495559692, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.18497732281684875, "step": 2751 }, { "epoch": 0.041093333532428944, "grad_norm": 0.421875, "grad_norm_var": 0.0013760884602864584, "learning_rate": 0.0001, "loss": 1.7889, "loss/crossentropy": 2.7084169387817383, "loss/fcd": 1.54296875, "loss/idx": 9.0, "loss/logits": 0.24589860439300537, "step": 2752 }, { "epoch": 0.04110826570304392, "grad_norm": 0.38671875, "grad_norm_var": 0.0011769612630208334, "learning_rate": 0.0001, "loss": 1.4795, "loss/crossentropy": 2.639194369316101, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18260712921619415, "step": 2753 }, { "epoch": 0.0411231978736589, "grad_norm": 0.4296875, "grad_norm_var": 0.0011197408040364584, "learning_rate": 0.0001, "loss": 1.5741, "loss/crossentropy": 2.6436243057250977, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.19906562566757202, "step": 2754 }, { "epoch": 0.041138130044273885, "grad_norm": 0.421875, "grad_norm_var": 0.0011164188385009766, "learning_rate": 0.0001, "loss": 1.5983, "loss/crossentropy": 2.7103906869888306, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.19600465148687363, "step": 2755 }, { "epoch": 0.04115306221488887, "grad_norm": 1.203125, "grad_norm_var": 0.040921258926391604, "learning_rate": 0.0001, "loss": 2.0532, "loss/crossentropy": 2.8846895694732666, "loss/fcd": 1.5390625, "loss/idx": 9.0, "loss/logits": 0.5141147449612617, "step": 2756 }, { "epoch": 0.04116799438550385, "grad_norm": 0.38671875, "grad_norm_var": 0.04100335439046224, "learning_rate": 0.0001, "loss": 1.4725, "loss/crossentropy": 2.5399192571640015, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.1716945543885231, "step": 2757 }, { "epoch": 0.04118292655611883, "grad_norm": 0.357421875, "grad_norm_var": 0.04081343015034993, "learning_rate": 0.0001, "loss": 1.5449, "loss/crossentropy": 2.6199474334716797, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.19729125499725342, "step": 2758 }, { "epoch": 0.04119785872673381, "grad_norm": 0.427734375, "grad_norm_var": 0.04083245595296224, "learning_rate": 0.0001, "loss": 1.6102, "loss/crossentropy": 2.6731573343276978, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.2039276659488678, "step": 2759 }, { "epoch": 0.04121279089734879, "grad_norm": 0.400390625, "grad_norm_var": 0.040787490208943684, "learning_rate": 0.0001, "loss": 1.4902, "loss/crossentropy": 2.814616084098816, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.18165452778339386, "step": 2760 }, { "epoch": 0.041227723067963774, "grad_norm": 0.45703125, "grad_norm_var": 0.040574073791503906, "learning_rate": 0.0001, "loss": 1.6974, "loss/crossentropy": 2.7427507638931274, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.24427790194749832, "step": 2761 }, { "epoch": 0.04124265523857876, "grad_norm": 0.38671875, "grad_norm_var": 0.04072098731994629, "learning_rate": 0.0001, "loss": 1.558, "loss/crossentropy": 2.5998035669326782, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.20641548186540604, "step": 2762 }, { "epoch": 0.04125758740919374, "grad_norm": 0.416015625, "grad_norm_var": 0.03991888364156087, "learning_rate": 0.0001, "loss": 1.6029, "loss/crossentropy": 2.4251781702041626, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.2083486169576645, "step": 2763 }, { "epoch": 0.041272519579808715, "grad_norm": 0.453125, "grad_norm_var": 0.03956589698791504, "learning_rate": 0.0001, "loss": 1.8759, "loss/crossentropy": 2.4615890979766846, "loss/fcd": 1.62890625, "loss/idx": 9.0, "loss/logits": 0.24696273356676102, "step": 2764 }, { "epoch": 0.0412874517504237, "grad_norm": 0.4140625, "grad_norm_var": 0.039383888244628906, "learning_rate": 0.0001, "loss": 1.6193, "loss/crossentropy": 2.6762359142303467, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.21303590387105942, "step": 2765 }, { "epoch": 0.04130238392103868, "grad_norm": 0.3984375, "grad_norm_var": 0.03966482480367025, "learning_rate": 0.0001, "loss": 1.6034, "loss/crossentropy": 2.419522285461426, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.2050010859966278, "step": 2766 }, { "epoch": 0.041317316091653664, "grad_norm": 0.3359375, "grad_norm_var": 0.04068806966145833, "learning_rate": 0.0001, "loss": 1.4825, "loss/crossentropy": 2.5919898748397827, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.18171725422143936, "step": 2767 }, { "epoch": 0.041332248262268646, "grad_norm": 0.41015625, "grad_norm_var": 0.04075005849202474, "learning_rate": 0.0001, "loss": 1.5874, "loss/crossentropy": 2.6908682584762573, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.1811361163854599, "step": 2768 }, { "epoch": 0.04134718043288363, "grad_norm": 0.42578125, "grad_norm_var": 0.040488115946451825, "learning_rate": 0.0001, "loss": 1.7394, "loss/crossentropy": 2.579013228416443, "loss/fcd": 1.50390625, "loss/idx": 9.0, "loss/logits": 0.2354956641793251, "step": 2769 }, { "epoch": 0.041362112603498605, "grad_norm": 0.40234375, "grad_norm_var": 0.04063720703125, "learning_rate": 0.0001, "loss": 1.634, "loss/crossentropy": 2.5948243141174316, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.21997284144163132, "step": 2770 }, { "epoch": 0.04137704477411359, "grad_norm": 0.64453125, "grad_norm_var": 0.042720985412597653, "learning_rate": 0.0001, "loss": 1.657, "loss/crossentropy": 2.5443456172943115, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.21949149668216705, "step": 2771 }, { "epoch": 0.04139197694472857, "grad_norm": 0.53515625, "grad_norm_var": 0.005310821533203125, "learning_rate": 0.0001, "loss": 1.718, "loss/crossentropy": 2.3682303428649902, "loss/fcd": 1.5, "loss/idx": 9.0, "loss/logits": 0.21796021610498428, "step": 2772 }, { "epoch": 0.04140690911534355, "grad_norm": 0.478515625, "grad_norm_var": 0.005329497655232747, "learning_rate": 0.0001, "loss": 1.7346, "loss/crossentropy": 2.4975554943084717, "loss/fcd": 1.5078125, "loss/idx": 9.0, "loss/logits": 0.22679634392261505, "step": 2773 }, { "epoch": 0.041421841285958536, "grad_norm": 0.458984375, "grad_norm_var": 0.00493772824605306, "learning_rate": 0.0001, "loss": 1.7005, "loss/crossentropy": 2.4736135005950928, "loss/fcd": 1.4765625, "loss/idx": 9.0, "loss/logits": 0.22390922904014587, "step": 2774 }, { "epoch": 0.04143677345657352, "grad_norm": 0.423828125, "grad_norm_var": 0.004945230484008789, "learning_rate": 0.0001, "loss": 1.4726, "loss/crossentropy": 2.6240592002868652, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.1756979376077652, "step": 2775 }, { "epoch": 0.041451705627188494, "grad_norm": 0.349609375, "grad_norm_var": 0.005375019709269206, "learning_rate": 0.0001, "loss": 1.4791, "loss/crossentropy": 2.4679059982299805, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18221387267112732, "step": 2776 }, { "epoch": 0.04146663779780348, "grad_norm": 0.38671875, "grad_norm_var": 0.005495182673136393, "learning_rate": 0.0001, "loss": 1.5905, "loss/crossentropy": 2.6297004222869873, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.21545636653900146, "step": 2777 }, { "epoch": 0.04148156996841846, "grad_norm": 0.3515625, "grad_norm_var": 0.005787007013956706, "learning_rate": 0.0001, "loss": 1.4676, "loss/crossentropy": 2.473557233810425, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.17849791049957275, "step": 2778 }, { "epoch": 0.04149650213903344, "grad_norm": 0.396484375, "grad_norm_var": 0.005848042170206706, "learning_rate": 0.0001, "loss": 1.4617, "loss/crossentropy": 2.6000512838363647, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.16480034589767456, "step": 2779 }, { "epoch": 0.041511434309648425, "grad_norm": 0.359375, "grad_norm_var": 0.006096760431925456, "learning_rate": 0.0001, "loss": 1.4552, "loss/crossentropy": 2.486622452735901, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.1700567975640297, "step": 2780 }, { "epoch": 0.0415263664802634, "grad_norm": 0.396484375, "grad_norm_var": 0.006137530008951823, "learning_rate": 0.0001, "loss": 1.5265, "loss/crossentropy": 2.8068591356277466, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.18665680289268494, "step": 2781 }, { "epoch": 0.04154129865087838, "grad_norm": 0.37109375, "grad_norm_var": 0.006270599365234375, "learning_rate": 0.0001, "loss": 1.68, "loss/crossentropy": 2.1927995085716248, "loss/fcd": 1.4453125, "loss/idx": 9.0, "loss/logits": 0.23466790467500687, "step": 2782 }, { "epoch": 0.041556230821493366, "grad_norm": 0.380859375, "grad_norm_var": 0.005890766779581706, "learning_rate": 0.0001, "loss": 1.6107, "loss/crossentropy": 2.5804786682128906, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.20449081808328629, "step": 2783 }, { "epoch": 0.04157116299210835, "grad_norm": 0.765625, "grad_norm_var": 0.013169082005818684, "learning_rate": 0.0001, "loss": 1.8582, "loss/crossentropy": 2.9243911504745483, "loss/fcd": 1.5625, "loss/idx": 9.0, "loss/logits": 0.29574915021657944, "step": 2784 }, { "epoch": 0.04158609516272333, "grad_norm": 0.396484375, "grad_norm_var": 0.013299496968587239, "learning_rate": 0.0001, "loss": 1.4575, "loss/crossentropy": 2.646905303001404, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18017712235450745, "step": 2785 }, { "epoch": 0.041601027333338314, "grad_norm": 0.44921875, "grad_norm_var": 0.01317895253499349, "learning_rate": 0.0001, "loss": 1.5819, "loss/crossentropy": 2.742881417274475, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.21469488739967346, "step": 2786 }, { "epoch": 0.04161595950395329, "grad_norm": 0.39453125, "grad_norm_var": 0.010485267639160157, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.435250163078308, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.2043372243642807, "step": 2787 }, { "epoch": 0.04163089167456827, "grad_norm": 0.46875, "grad_norm_var": 0.009837849934895834, "learning_rate": 0.0001, "loss": 1.932, "loss/crossentropy": 2.434775948524475, "loss/fcd": 1.65625, "loss/idx": 9.0, "loss/logits": 0.2757757902145386, "step": 2788 }, { "epoch": 0.041645823845183255, "grad_norm": 0.421875, "grad_norm_var": 0.009647480646769206, "learning_rate": 0.0001, "loss": 1.5421, "loss/crossentropy": 2.4317500591278076, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.20221126824617386, "step": 2789 }, { "epoch": 0.04166075601579824, "grad_norm": 2.390625, "grad_norm_var": 0.25206146240234373, "learning_rate": 0.0001, "loss": 2.095, "loss/crossentropy": 3.4149022102355957, "loss/fcd": 1.875, "loss/idx": 9.0, "loss/logits": 0.22001376003026962, "step": 2790 }, { "epoch": 0.04167568818641322, "grad_norm": 0.373046875, "grad_norm_var": 0.2530359268188477, "learning_rate": 0.0001, "loss": 1.4473, "loss/crossentropy": 2.630018711090088, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17387185990810394, "step": 2791 }, { "epoch": 0.041690620357028196, "grad_norm": 0.38671875, "grad_norm_var": 0.2521761417388916, "learning_rate": 0.0001, "loss": 1.461, "loss/crossentropy": 2.78242826461792, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.17589021474123, "step": 2792 }, { "epoch": 0.04170555252764318, "grad_norm": 0.373046875, "grad_norm_var": 0.2524728775024414, "learning_rate": 0.0001, "loss": 1.554, "loss/crossentropy": 2.594369053840637, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19465132057666779, "step": 2793 }, { "epoch": 0.04172048469825816, "grad_norm": 0.484375, "grad_norm_var": 0.25019880930582683, "learning_rate": 0.0001, "loss": 1.4458, "loss/crossentropy": 2.6890151500701904, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.1606050729751587, "step": 2794 }, { "epoch": 0.041735416868873144, "grad_norm": 0.51953125, "grad_norm_var": 0.24861766497294108, "learning_rate": 0.0001, "loss": 1.9377, "loss/crossentropy": 2.6643046140670776, "loss/fcd": 1.640625, "loss/idx": 9.0, "loss/logits": 0.29704001545906067, "step": 2795 }, { "epoch": 0.04175034903948813, "grad_norm": 0.37109375, "grad_norm_var": 0.2483155409495036, "learning_rate": 0.0001, "loss": 1.4664, "loss/crossentropy": 2.610530138015747, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.1694956049323082, "step": 2796 }, { "epoch": 0.04176528121010311, "grad_norm": 0.59765625, "grad_norm_var": 0.24648685455322267, "learning_rate": 0.0001, "loss": 1.741, "loss/crossentropy": 2.6678597927093506, "loss/fcd": 1.51171875, "loss/idx": 9.0, "loss/logits": 0.22932370007038116, "step": 2797 }, { "epoch": 0.041780213380718086, "grad_norm": 0.4296875, "grad_norm_var": 0.245135498046875, "learning_rate": 0.0001, "loss": 1.7192, "loss/crossentropy": 2.449122667312622, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.23088199645280838, "step": 2798 }, { "epoch": 0.04179514555133307, "grad_norm": 0.3828125, "grad_norm_var": 0.2450851281483968, "learning_rate": 0.0001, "loss": 1.6246, "loss/crossentropy": 2.756286382675171, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.214475616812706, "step": 2799 }, { "epoch": 0.04181007772194805, "grad_norm": 0.361328125, "grad_norm_var": 0.24504235585530598, "learning_rate": 0.0001, "loss": 1.5663, "loss/crossentropy": 2.6416287422180176, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20688913762569427, "step": 2800 }, { "epoch": 0.041825009892563034, "grad_norm": 0.345703125, "grad_norm_var": 0.2462432861328125, "learning_rate": 0.0001, "loss": 1.5371, "loss/crossentropy": 2.4355937242507935, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.1933525800704956, "step": 2801 }, { "epoch": 0.041839942063178016, "grad_norm": 0.42578125, "grad_norm_var": 0.24658279418945311, "learning_rate": 0.0001, "loss": 1.5705, "loss/crossentropy": 2.813068985939026, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.19937017560005188, "step": 2802 }, { "epoch": 0.04185487423379299, "grad_norm": 0.376953125, "grad_norm_var": 0.2469557285308838, "learning_rate": 0.0001, "loss": 1.5509, "loss/crossentropy": 2.6356961727142334, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.21496787667274475, "step": 2803 }, { "epoch": 0.041869806404407975, "grad_norm": 0.390625, "grad_norm_var": 0.24812429745992023, "learning_rate": 0.0001, "loss": 1.5874, "loss/crossentropy": 2.8318744897842407, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.1968107670545578, "step": 2804 }, { "epoch": 0.04188473857502296, "grad_norm": 0.392578125, "grad_norm_var": 0.2486371358235677, "learning_rate": 0.0001, "loss": 1.5089, "loss/crossentropy": 2.693944811820984, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.19249442219734192, "step": 2805 }, { "epoch": 0.04189967074563794, "grad_norm": 0.39453125, "grad_norm_var": 0.00448602040608724, "learning_rate": 0.0001, "loss": 1.4679, "loss/crossentropy": 2.9142916202545166, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.1631920412182808, "step": 2806 }, { "epoch": 0.04191460291625292, "grad_norm": 0.455078125, "grad_norm_var": 0.004471333821614584, "learning_rate": 0.0001, "loss": 1.6388, "loss/crossentropy": 2.632357358932495, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.21298877894878387, "step": 2807 }, { "epoch": 0.041929535086867906, "grad_norm": 0.486328125, "grad_norm_var": 0.004676421483357747, "learning_rate": 0.0001, "loss": 1.7847, "loss/crossentropy": 2.3999547958374023, "loss/fcd": 1.5625, "loss/idx": 9.0, "loss/logits": 0.22218617796897888, "step": 2808 }, { "epoch": 0.04194446725748288, "grad_norm": 0.5703125, "grad_norm_var": 0.00576324462890625, "learning_rate": 0.0001, "loss": 1.7091, "loss/crossentropy": 2.7823559045791626, "loss/fcd": 1.45703125, "loss/idx": 9.0, "loss/logits": 0.25206881016492844, "step": 2809 }, { "epoch": 0.041959399428097864, "grad_norm": 0.50390625, "grad_norm_var": 0.005911699930826823, "learning_rate": 0.0001, "loss": 1.6885, "loss/crossentropy": 2.59482204914093, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.22753046452999115, "step": 2810 }, { "epoch": 0.04197433159871285, "grad_norm": 0.44140625, "grad_norm_var": 0.00544122060139974, "learning_rate": 0.0001, "loss": 1.5753, "loss/crossentropy": 2.6306631565093994, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.19637111574411392, "step": 2811 }, { "epoch": 0.04198926376932783, "grad_norm": 0.61328125, "grad_norm_var": 0.007112566630045573, "learning_rate": 0.0001, "loss": 1.7951, "loss/crossentropy": 2.7760783433914185, "loss/fcd": 1.52734375, "loss/idx": 9.0, "loss/logits": 0.26778970658779144, "step": 2812 }, { "epoch": 0.04200419593994281, "grad_norm": 0.44921875, "grad_norm_var": 0.005527687072753906, "learning_rate": 0.0001, "loss": 1.5329, "loss/crossentropy": 2.5381555557250977, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.18527136743068695, "step": 2813 }, { "epoch": 0.04201912811055779, "grad_norm": 0.408203125, "grad_norm_var": 0.005582412083943685, "learning_rate": 0.0001, "loss": 1.6399, "loss/crossentropy": 2.517733931541443, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.20235756039619446, "step": 2814 }, { "epoch": 0.04203406028117277, "grad_norm": 0.427734375, "grad_norm_var": 0.0053817113240559895, "learning_rate": 0.0001, "loss": 1.5085, "loss/crossentropy": 2.85615336894989, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.18817099183797836, "step": 2815 }, { "epoch": 0.04204899245178775, "grad_norm": 0.390625, "grad_norm_var": 0.005127318700154622, "learning_rate": 0.0001, "loss": 1.5514, "loss/crossentropy": 2.7043405771255493, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.20378877967596054, "step": 2816 }, { "epoch": 0.042063924622402736, "grad_norm": 0.466796875, "grad_norm_var": 0.004488738377888998, "learning_rate": 0.0001, "loss": 1.7543, "loss/crossentropy": 2.737867832183838, "loss/fcd": 1.53125, "loss/idx": 9.0, "loss/logits": 0.22308874130249023, "step": 2817 }, { "epoch": 0.04207885679301772, "grad_norm": 0.3515625, "grad_norm_var": 0.005068572362263998, "learning_rate": 0.0001, "loss": 1.4345, "loss/crossentropy": 2.6894924640655518, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.17666258662939072, "step": 2818 }, { "epoch": 0.0420937889636327, "grad_norm": 0.39453125, "grad_norm_var": 0.00492852528889974, "learning_rate": 0.0001, "loss": 1.6598, "loss/crossentropy": 2.6746203899383545, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.2183963507413864, "step": 2819 }, { "epoch": 0.04210872113424768, "grad_norm": 0.53515625, "grad_norm_var": 0.005166117350260417, "learning_rate": 0.0001, "loss": 1.6467, "loss/crossentropy": 2.9185707569122314, "loss/fcd": 1.4453125, "loss/idx": 9.0, "loss/logits": 0.2014087662100792, "step": 2820 }, { "epoch": 0.04212365330486266, "grad_norm": 0.48828125, "grad_norm_var": 0.004941034317016602, "learning_rate": 0.0001, "loss": 1.663, "loss/crossentropy": 2.9125460386276245, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.23336027562618256, "step": 2821 }, { "epoch": 0.04213858547547764, "grad_norm": 0.404296875, "grad_norm_var": 0.0048603693644205725, "learning_rate": 0.0001, "loss": 1.5936, "loss/crossentropy": 2.859641909599304, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.20692577213048935, "step": 2822 }, { "epoch": 0.042153517646092625, "grad_norm": 0.48046875, "grad_norm_var": 0.004878346125284831, "learning_rate": 0.0001, "loss": 1.6354, "loss/crossentropy": 2.5753769874572754, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.21747109293937683, "step": 2823 }, { "epoch": 0.04216844981670761, "grad_norm": 0.357421875, "grad_norm_var": 0.005520359675089518, "learning_rate": 0.0001, "loss": 1.5683, "loss/crossentropy": 2.6137914657592773, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20893336832523346, "step": 2824 }, { "epoch": 0.042183381987322584, "grad_norm": 0.380859375, "grad_norm_var": 0.0048558553059895836, "learning_rate": 0.0001, "loss": 1.5874, "loss/crossentropy": 2.574586272239685, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.22023839503526688, "step": 2825 }, { "epoch": 0.042198314157937566, "grad_norm": 0.37109375, "grad_norm_var": 0.004886118570963541, "learning_rate": 0.0001, "loss": 1.3687, "loss/crossentropy": 2.6594600677490234, "loss/fcd": 1.20703125, "loss/idx": 9.0, "loss/logits": 0.16168400645256042, "step": 2826 }, { "epoch": 0.04221324632855255, "grad_norm": 0.392578125, "grad_norm_var": 0.004993804295857747, "learning_rate": 0.0001, "loss": 1.4519, "loss/crossentropy": 2.5055553913116455, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.1862962692975998, "step": 2827 }, { "epoch": 0.04222817849916753, "grad_norm": 0.419921875, "grad_norm_var": 0.0026570638020833332, "learning_rate": 0.0001, "loss": 1.5522, "loss/crossentropy": 2.793365478515625, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.1967291608452797, "step": 2828 }, { "epoch": 0.042243110669782515, "grad_norm": 0.36328125, "grad_norm_var": 0.0027829488118489582, "learning_rate": 0.0001, "loss": 1.5614, "loss/crossentropy": 2.442718744277954, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20197969675064087, "step": 2829 }, { "epoch": 0.0422580428403975, "grad_norm": 0.490234375, "grad_norm_var": 0.0031340916951497397, "learning_rate": 0.0001, "loss": 1.5865, "loss/crossentropy": 2.68276309967041, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.207596093416214, "step": 2830 }, { "epoch": 0.04227297501101247, "grad_norm": 0.4765625, "grad_norm_var": 0.00333555539449056, "learning_rate": 0.0001, "loss": 1.5693, "loss/crossentropy": 2.663111686706543, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.19037442654371262, "step": 2831 }, { "epoch": 0.042287907181627456, "grad_norm": 0.388671875, "grad_norm_var": 0.003344154357910156, "learning_rate": 0.0001, "loss": 1.5105, "loss/crossentropy": 2.531092643737793, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.19016867876052856, "step": 2832 }, { "epoch": 0.04230283935224244, "grad_norm": 0.46484375, "grad_norm_var": 0.0033328851064046224, "learning_rate": 0.0001, "loss": 1.6411, "loss/crossentropy": 2.619768261909485, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.21926890313625336, "step": 2833 }, { "epoch": 0.04231777152285742, "grad_norm": 0.57421875, "grad_norm_var": 0.00432585080464681, "learning_rate": 0.0001, "loss": 2.0097, "loss/crossentropy": 2.566089630126953, "loss/fcd": 1.73046875, "loss/idx": 9.0, "loss/logits": 0.27919960021972656, "step": 2834 }, { "epoch": 0.042332703693472404, "grad_norm": 0.3984375, "grad_norm_var": 0.004304997126261393, "learning_rate": 0.0001, "loss": 1.5583, "loss/crossentropy": 2.725817561149597, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.2106863260269165, "step": 2835 }, { "epoch": 0.042347635864087387, "grad_norm": 0.4140625, "grad_norm_var": 0.0036309401194254557, "learning_rate": 0.0001, "loss": 1.4887, "loss/crossentropy": 2.7062512636184692, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.19961658865213394, "step": 2836 }, { "epoch": 0.04236256803470236, "grad_norm": 0.4140625, "grad_norm_var": 0.003389342625935872, "learning_rate": 0.0001, "loss": 1.3621, "loss/crossentropy": 2.2744503021240234, "loss/fcd": 1.2109375, "loss/idx": 9.0, "loss/logits": 0.1511535719037056, "step": 2837 }, { "epoch": 0.042377500205317345, "grad_norm": 0.349609375, "grad_norm_var": 0.0037231286366780597, "learning_rate": 0.0001, "loss": 1.5496, "loss/crossentropy": 2.603350043296814, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.20191775262355804, "step": 2838 }, { "epoch": 0.04239243237593233, "grad_norm": 0.46484375, "grad_norm_var": 0.0036145369211832683, "learning_rate": 0.0001, "loss": 1.7878, "loss/crossentropy": 2.5892986059188843, "loss/fcd": 1.53125, "loss/idx": 9.0, "loss/logits": 0.25652115046977997, "step": 2839 }, { "epoch": 0.04240736454654731, "grad_norm": 0.384765625, "grad_norm_var": 0.0034329573313395183, "learning_rate": 0.0001, "loss": 1.5484, "loss/crossentropy": 2.5174341201782227, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.1968633010983467, "step": 2840 }, { "epoch": 0.04242229671716229, "grad_norm": 0.451171875, "grad_norm_var": 0.0033585707346598308, "learning_rate": 0.0001, "loss": 1.4762, "loss/crossentropy": 2.707062005996704, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.17146320641040802, "step": 2841 }, { "epoch": 0.04243722888777727, "grad_norm": 0.376953125, "grad_norm_var": 0.003317705790201823, "learning_rate": 0.0001, "loss": 1.4772, "loss/crossentropy": 2.7578189373016357, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18811318278312683, "step": 2842 }, { "epoch": 0.04245216105839225, "grad_norm": 0.423828125, "grad_norm_var": 0.003237342834472656, "learning_rate": 0.0001, "loss": 1.5011, "loss/crossentropy": 2.5047038793563843, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.18082007765769958, "step": 2843 }, { "epoch": 0.042467093229007234, "grad_norm": 0.4296875, "grad_norm_var": 0.0032321770985921225, "learning_rate": 0.0001, "loss": 1.6164, "loss/crossentropy": 2.4976354837417603, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.19840535521507263, "step": 2844 }, { "epoch": 0.04248202539962222, "grad_norm": 0.38671875, "grad_norm_var": 0.00306089719136556, "learning_rate": 0.0001, "loss": 1.4977, "loss/crossentropy": 2.7203586101531982, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.18125635385513306, "step": 2845 }, { "epoch": 0.0424969575702372, "grad_norm": 0.392578125, "grad_norm_var": 0.0028796990712483725, "learning_rate": 0.0001, "loss": 1.4895, "loss/crossentropy": 2.8402888774871826, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18480675667524338, "step": 2846 }, { "epoch": 0.04251188974085218, "grad_norm": 0.431640625, "grad_norm_var": 0.002693621317545573, "learning_rate": 0.0001, "loss": 1.5656, "loss/crossentropy": 2.5093201398849487, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20625276118516922, "step": 2847 }, { "epoch": 0.04252682191146716, "grad_norm": 0.380859375, "grad_norm_var": 0.002731768290201823, "learning_rate": 0.0001, "loss": 1.5589, "loss/crossentropy": 2.7247687578201294, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.1955927386879921, "step": 2848 }, { "epoch": 0.04254175408208214, "grad_norm": 0.421875, "grad_norm_var": 0.0025967915852864584, "learning_rate": 0.0001, "loss": 1.5349, "loss/crossentropy": 2.6076769828796387, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.18719633668661118, "step": 2849 }, { "epoch": 0.04255668625269712, "grad_norm": 0.515625, "grad_norm_var": 0.0015944798787434896, "learning_rate": 0.0001, "loss": 1.796, "loss/crossentropy": 2.7607619762420654, "loss/fcd": 1.5390625, "loss/idx": 9.0, "loss/logits": 0.2569810450077057, "step": 2850 }, { "epoch": 0.042571618423312106, "grad_norm": 0.357421875, "grad_norm_var": 0.001789077123006185, "learning_rate": 0.0001, "loss": 1.4276, "loss/crossentropy": 2.402793049812317, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.1580381616950035, "step": 2851 }, { "epoch": 0.04258655059392709, "grad_norm": 0.47265625, "grad_norm_var": 0.0020179589589436847, "learning_rate": 0.0001, "loss": 1.6538, "loss/crossentropy": 2.3532371520996094, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.22409392148256302, "step": 2852 }, { "epoch": 0.042601482764542065, "grad_norm": 0.419921875, "grad_norm_var": 0.0020186742146809894, "learning_rate": 0.0001, "loss": 1.5901, "loss/crossentropy": 2.58163845539093, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.19556696712970734, "step": 2853 }, { "epoch": 0.04261641493515705, "grad_norm": 0.37109375, "grad_norm_var": 0.0018565972646077474, "learning_rate": 0.0001, "loss": 1.3937, "loss/crossentropy": 2.5678194761276245, "loss/fcd": 1.23828125, "loss/idx": 9.0, "loss/logits": 0.15544519573450089, "step": 2854 }, { "epoch": 0.04263134710577203, "grad_norm": 0.361328125, "grad_norm_var": 0.0018742879231770833, "learning_rate": 0.0001, "loss": 1.4611, "loss/crossentropy": 2.658408284187317, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.17984597384929657, "step": 2855 }, { "epoch": 0.04264627927638701, "grad_norm": 0.416015625, "grad_norm_var": 0.0018254597981770834, "learning_rate": 0.0001, "loss": 1.5411, "loss/crossentropy": 2.5462645292282104, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.18564967811107635, "step": 2856 }, { "epoch": 0.042661211447001995, "grad_norm": 0.44140625, "grad_norm_var": 0.0017818291982014975, "learning_rate": 0.0001, "loss": 1.576, "loss/crossentropy": 2.6586694717407227, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.19322709739208221, "step": 2857 }, { "epoch": 0.04267614361761698, "grad_norm": 0.46875, "grad_norm_var": 0.0018737157185872396, "learning_rate": 0.0001, "loss": 1.5806, "loss/crossentropy": 2.3834999799728394, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.18217167258262634, "step": 2858 }, { "epoch": 0.042691075788231954, "grad_norm": 0.39453125, "grad_norm_var": 0.0019054253896077474, "learning_rate": 0.0001, "loss": 1.836, "loss/crossentropy": 2.538835644721985, "loss/fcd": 1.578125, "loss/idx": 9.0, "loss/logits": 0.25788363814353943, "step": 2859 }, { "epoch": 0.04270600795884694, "grad_norm": 0.404296875, "grad_norm_var": 0.0019006729125976562, "learning_rate": 0.0001, "loss": 1.6709, "loss/crossentropy": 2.553062081336975, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.22171758115291595, "step": 2860 }, { "epoch": 0.04272094012946192, "grad_norm": 0.359375, "grad_norm_var": 0.0020497639973958335, "learning_rate": 0.0001, "loss": 1.5975, "loss/crossentropy": 2.441391110420227, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.19517482817173004, "step": 2861 }, { "epoch": 0.0427358723000769, "grad_norm": 0.625, "grad_norm_var": 0.004790480931599935, "learning_rate": 0.0001, "loss": 1.7339, "loss/crossentropy": 2.6342281103134155, "loss/fcd": 1.5078125, "loss/idx": 9.0, "loss/logits": 0.22613313049077988, "step": 2862 }, { "epoch": 0.042750804470691885, "grad_norm": 0.369140625, "grad_norm_var": 0.005001052220662435, "learning_rate": 0.0001, "loss": 1.5458, "loss/crossentropy": 2.602490782737732, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.2059965804219246, "step": 2863 }, { "epoch": 0.04276573664130686, "grad_norm": 0.3828125, "grad_norm_var": 0.004990132649739584, "learning_rate": 0.0001, "loss": 1.6309, "loss/crossentropy": 2.732432246208191, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.2167930006980896, "step": 2864 }, { "epoch": 0.04278066881192184, "grad_norm": 0.5625, "grad_norm_var": 0.006189473470052083, "learning_rate": 0.0001, "loss": 1.6242, "loss/crossentropy": 2.620902419090271, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.21404869854450226, "step": 2865 }, { "epoch": 0.042795600982536826, "grad_norm": 0.5703125, "grad_norm_var": 0.006981658935546875, "learning_rate": 0.0001, "loss": 1.7733, "loss/crossentropy": 2.567645788192749, "loss/fcd": 1.55078125, "loss/idx": 9.0, "loss/logits": 0.22251295298337936, "step": 2866 }, { "epoch": 0.04281053315315181, "grad_norm": 2.765625, "grad_norm_var": 0.34420456886291506, "learning_rate": 0.0001, "loss": 2.2277, "loss/crossentropy": 2.731574773788452, "loss/fcd": 1.85546875, "loss/idx": 9.0, "loss/logits": 0.372261717915535, "step": 2867 }, { "epoch": 0.04282546532376679, "grad_norm": 0.458984375, "grad_norm_var": 0.34442386627197263, "learning_rate": 0.0001, "loss": 1.5033, "loss/crossentropy": 2.7929354906082153, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1868971884250641, "step": 2868 }, { "epoch": 0.042840397494381774, "grad_norm": 0.478515625, "grad_norm_var": 0.34334335327148435, "learning_rate": 0.0001, "loss": 1.6066, "loss/crossentropy": 2.4592596292495728, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.20035922527313232, "step": 2869 }, { "epoch": 0.04285532966499675, "grad_norm": 0.451171875, "grad_norm_var": 0.3414137363433838, "learning_rate": 0.0001, "loss": 1.6544, "loss/crossentropy": 2.7105917930603027, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.2364177256822586, "step": 2870 }, { "epoch": 0.04287026183561173, "grad_norm": 0.435546875, "grad_norm_var": 0.3394519646962484, "learning_rate": 0.0001, "loss": 1.4292, "loss/crossentropy": 2.6199839115142822, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.16359961032867432, "step": 2871 }, { "epoch": 0.042885194006226715, "grad_norm": 0.419921875, "grad_norm_var": 0.3393576145172119, "learning_rate": 0.0001, "loss": 1.5178, "loss/crossentropy": 2.558248281478882, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.1857428103685379, "step": 2872 }, { "epoch": 0.0429001261768417, "grad_norm": 0.412109375, "grad_norm_var": 0.3400278091430664, "learning_rate": 0.0001, "loss": 1.5908, "loss/crossentropy": 2.518897771835327, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.20796960592269897, "step": 2873 }, { "epoch": 0.04291505834745668, "grad_norm": 0.443359375, "grad_norm_var": 0.3405036767323812, "learning_rate": 0.0001, "loss": 1.7094, "loss/crossentropy": 2.793637275695801, "loss/fcd": 1.4765625, "loss/idx": 9.0, "loss/logits": 0.2328309267759323, "step": 2874 }, { "epoch": 0.042929990518071656, "grad_norm": 0.41796875, "grad_norm_var": 0.3399089654286703, "learning_rate": 0.0001, "loss": 1.6137, "loss/crossentropy": 2.571768045425415, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.20749187469482422, "step": 2875 }, { "epoch": 0.04294492268868664, "grad_norm": 0.43359375, "grad_norm_var": 0.339208730061849, "learning_rate": 0.0001, "loss": 1.8362, "loss/crossentropy": 2.642799735069275, "loss/fcd": 1.58984375, "loss/idx": 9.0, "loss/logits": 0.24636705219745636, "step": 2876 }, { "epoch": 0.04295985485930162, "grad_norm": 0.369140625, "grad_norm_var": 0.33890252113342284, "learning_rate": 0.0001, "loss": 1.412, "loss/crossentropy": 2.6782093048095703, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.17766575515270233, "step": 2877 }, { "epoch": 0.042974787029916604, "grad_norm": 0.427734375, "grad_norm_var": 0.3406700134277344, "learning_rate": 0.0001, "loss": 1.6011, "loss/crossentropy": 2.5905710458755493, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.23781238496303558, "step": 2878 }, { "epoch": 0.04298971920053159, "grad_norm": 0.5078125, "grad_norm_var": 0.33783631324768065, "learning_rate": 0.0001, "loss": 1.578, "loss/crossentropy": 2.6587504148483276, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.1951412633061409, "step": 2879 }, { "epoch": 0.04300465137114657, "grad_norm": 0.46875, "grad_norm_var": 0.3358543237050374, "learning_rate": 0.0001, "loss": 1.6473, "loss/crossentropy": 2.855278968811035, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.22541087865829468, "step": 2880 }, { "epoch": 0.043019583541761545, "grad_norm": 0.3515625, "grad_norm_var": 0.339730437596639, "learning_rate": 0.0001, "loss": 1.5344, "loss/crossentropy": 2.4527703523635864, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.19847294688224792, "step": 2881 }, { "epoch": 0.04303451571237653, "grad_norm": 0.447265625, "grad_norm_var": 0.34097112019856773, "learning_rate": 0.0001, "loss": 1.7102, "loss/crossentropy": 2.655408263206482, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.22973114997148514, "step": 2882 }, { "epoch": 0.04304944788299151, "grad_norm": 0.447265625, "grad_norm_var": 0.0014620304107666015, "learning_rate": 0.0001, "loss": 1.6304, "loss/crossentropy": 2.6561676263809204, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.22026553750038147, "step": 2883 }, { "epoch": 0.043064380053606494, "grad_norm": 0.39453125, "grad_norm_var": 0.00152130126953125, "learning_rate": 0.0001, "loss": 1.5975, "loss/crossentropy": 2.6402101516723633, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.21468909084796906, "step": 2884 }, { "epoch": 0.043079312224221476, "grad_norm": 0.373046875, "grad_norm_var": 0.0015573501586914062, "learning_rate": 0.0001, "loss": 1.5048, "loss/crossentropy": 2.6471338272094727, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.17670994997024536, "step": 2885 }, { "epoch": 0.04309424439483645, "grad_norm": 0.361328125, "grad_norm_var": 0.0017489115397135416, "learning_rate": 0.0001, "loss": 1.4508, "loss/crossentropy": 2.540492057800293, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.16568507254123688, "step": 2886 }, { "epoch": 0.043109176565451435, "grad_norm": 0.34375, "grad_norm_var": 0.0020783583323160807, "learning_rate": 0.0001, "loss": 1.5091, "loss/crossentropy": 2.44563889503479, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19658329337835312, "step": 2887 }, { "epoch": 0.04312410873606642, "grad_norm": 0.357421875, "grad_norm_var": 0.0022706190745035806, "learning_rate": 0.0001, "loss": 1.5988, "loss/crossentropy": 2.443848967552185, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.21203475445508957, "step": 2888 }, { "epoch": 0.0431390409066814, "grad_norm": 0.5078125, "grad_norm_var": 0.002872657775878906, "learning_rate": 0.0001, "loss": 1.7245, "loss/crossentropy": 2.8028016090393066, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.2596260756254196, "step": 2889 }, { "epoch": 0.04315397307729638, "grad_norm": 0.390625, "grad_norm_var": 0.0028524875640869142, "learning_rate": 0.0001, "loss": 1.7053, "loss/crossentropy": 2.6603466272354126, "loss/fcd": 1.4765625, "loss/idx": 9.0, "loss/logits": 0.2287246435880661, "step": 2890 }, { "epoch": 0.043168905247911366, "grad_norm": 0.37109375, "grad_norm_var": 0.0029554843902587892, "learning_rate": 0.0001, "loss": 1.4665, "loss/crossentropy": 2.708483934402466, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18917420506477356, "step": 2891 }, { "epoch": 0.04318383741852634, "grad_norm": 0.462890625, "grad_norm_var": 0.0031030654907226564, "learning_rate": 0.0001, "loss": 1.9272, "loss/crossentropy": 2.660986542701721, "loss/fcd": 1.609375, "loss/idx": 9.0, "loss/logits": 0.3178368806838989, "step": 2892 }, { "epoch": 0.043198769589141324, "grad_norm": 0.359375, "grad_norm_var": 0.0031640211741129558, "learning_rate": 0.0001, "loss": 1.4296, "loss/crossentropy": 2.5787829160690308, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.17180150747299194, "step": 2893 }, { "epoch": 0.04321370175975631, "grad_norm": 0.365234375, "grad_norm_var": 0.003266763687133789, "learning_rate": 0.0001, "loss": 1.4665, "loss/crossentropy": 2.774833917617798, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.189126119017601, "step": 2894 }, { "epoch": 0.04322863393037129, "grad_norm": 0.380859375, "grad_norm_var": 0.0025652567545572918, "learning_rate": 0.0001, "loss": 1.5455, "loss/crossentropy": 2.6927337646484375, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.20952176302671432, "step": 2895 }, { "epoch": 0.04324356610098627, "grad_norm": 0.453125, "grad_norm_var": 0.0024350484212239585, "learning_rate": 0.0001, "loss": 1.4865, "loss/crossentropy": 2.549205183982849, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18179070204496384, "step": 2896 }, { "epoch": 0.043258498271601255, "grad_norm": 0.443359375, "grad_norm_var": 0.002393960952758789, "learning_rate": 0.0001, "loss": 1.6041, "loss/crossentropy": 2.7098318338394165, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.2134682834148407, "step": 2897 }, { "epoch": 0.04327343044221623, "grad_norm": 0.478515625, "grad_norm_var": 0.002636575698852539, "learning_rate": 0.0001, "loss": 1.7554, "loss/crossentropy": 2.641385793685913, "loss/fcd": 1.49609375, "loss/idx": 9.0, "loss/logits": 0.25932417809963226, "step": 2898 }, { "epoch": 0.04328836261283121, "grad_norm": 0.376953125, "grad_norm_var": 0.0025553226470947264, "learning_rate": 0.0001, "loss": 1.5911, "loss/crossentropy": 2.5933384895324707, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.20831818133592606, "step": 2899 }, { "epoch": 0.043303294783446196, "grad_norm": 0.375, "grad_norm_var": 0.0025966485341389974, "learning_rate": 0.0001, "loss": 1.5663, "loss/crossentropy": 2.547296404838562, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.2030114009976387, "step": 2900 }, { "epoch": 0.04331822695406118, "grad_norm": 0.3984375, "grad_norm_var": 0.002545611063639323, "learning_rate": 0.0001, "loss": 1.5767, "loss/crossentropy": 2.6054317951202393, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.2056415155529976, "step": 2901 }, { "epoch": 0.04333315912467616, "grad_norm": 0.41796875, "grad_norm_var": 0.0024418989817301434, "learning_rate": 0.0001, "loss": 1.5437, "loss/crossentropy": 2.6876357793807983, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.20386947691440582, "step": 2902 }, { "epoch": 0.04334809129529114, "grad_norm": 0.4609375, "grad_norm_var": 0.002340809504191081, "learning_rate": 0.0001, "loss": 1.5221, "loss/crossentropy": 2.736685037612915, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.1939808875322342, "step": 2903 }, { "epoch": 0.04336302346590612, "grad_norm": 0.44921875, "grad_norm_var": 0.002193641662597656, "learning_rate": 0.0001, "loss": 1.6701, "loss/crossentropy": 2.50700306892395, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.21700645983219147, "step": 2904 }, { "epoch": 0.0433779556365211, "grad_norm": 0.408203125, "grad_norm_var": 0.0016237735748291016, "learning_rate": 0.0001, "loss": 1.6906, "loss/crossentropy": 2.3313392400741577, "loss/fcd": 1.46875, "loss/idx": 9.0, "loss/logits": 0.22188710421323776, "step": 2905 }, { "epoch": 0.043392887807136085, "grad_norm": 0.3984375, "grad_norm_var": 0.0016053358713785807, "learning_rate": 0.0001, "loss": 1.4699, "loss/crossentropy": 2.986729621887207, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.1808188185095787, "step": 2906 }, { "epoch": 0.04340781997775107, "grad_norm": 0.46875, "grad_norm_var": 0.0016625563303629557, "learning_rate": 0.0001, "loss": 1.7529, "loss/crossentropy": 2.362679958343506, "loss/fcd": 1.515625, "loss/idx": 9.0, "loss/logits": 0.23723523318767548, "step": 2907 }, { "epoch": 0.04342275214836605, "grad_norm": 0.427734375, "grad_norm_var": 0.0015320936838785807, "learning_rate": 0.0001, "loss": 1.722, "loss/crossentropy": 2.539456367492676, "loss/fcd": 1.50390625, "loss/idx": 9.0, "loss/logits": 0.2181280255317688, "step": 2908 }, { "epoch": 0.043437684318981026, "grad_norm": 0.462890625, "grad_norm_var": 0.0014149983723958333, "learning_rate": 0.0001, "loss": 1.5861, "loss/crossentropy": 2.745030403137207, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.20332366228103638, "step": 2909 }, { "epoch": 0.04345261648959601, "grad_norm": 0.40234375, "grad_norm_var": 0.001215982437133789, "learning_rate": 0.0001, "loss": 1.5938, "loss/crossentropy": 2.6791075468063354, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.2110261246562004, "step": 2910 }, { "epoch": 0.04346754866021099, "grad_norm": 0.365234375, "grad_norm_var": 0.001323556900024414, "learning_rate": 0.0001, "loss": 1.4431, "loss/crossentropy": 2.607409715652466, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.18135695904493332, "step": 2911 }, { "epoch": 0.043482480830825974, "grad_norm": 0.40234375, "grad_norm_var": 0.0012888431549072266, "learning_rate": 0.0001, "loss": 1.5492, "loss/crossentropy": 2.784993529319763, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19761042296886444, "step": 2912 }, { "epoch": 0.04349741300144096, "grad_norm": 0.38671875, "grad_norm_var": 0.001320648193359375, "learning_rate": 0.0001, "loss": 1.4405, "loss/crossentropy": 2.8376271724700928, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.18658211827278137, "step": 2913 }, { "epoch": 0.04351234517205593, "grad_norm": 0.482421875, "grad_norm_var": 0.0013533910115559897, "learning_rate": 0.0001, "loss": 1.6168, "loss/crossentropy": 2.549038887023926, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.20663845539093018, "step": 2914 }, { "epoch": 0.043527277342670916, "grad_norm": 0.3515625, "grad_norm_var": 0.0015317122141520181, "learning_rate": 0.0001, "loss": 1.4801, "loss/crossentropy": 2.3261440992355347, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.1832374781370163, "step": 2915 }, { "epoch": 0.0435422095132859, "grad_norm": 0.35546875, "grad_norm_var": 0.0016626834869384766, "learning_rate": 0.0001, "loss": 1.6882, "loss/crossentropy": 2.246339499950409, "loss/fcd": 1.45703125, "loss/idx": 9.0, "loss/logits": 0.23117278516292572, "step": 2916 }, { "epoch": 0.04355714168390088, "grad_norm": 0.41015625, "grad_norm_var": 0.001645517349243164, "learning_rate": 0.0001, "loss": 1.5438, "loss/crossentropy": 2.460755228996277, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.1844637170433998, "step": 2917 }, { "epoch": 0.043572073854515864, "grad_norm": 0.357421875, "grad_norm_var": 0.0018559137980143229, "learning_rate": 0.0001, "loss": 1.5017, "loss/crossentropy": 2.707395076751709, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.19699087738990784, "step": 2918 }, { "epoch": 0.043587006025130846, "grad_norm": 0.41796875, "grad_norm_var": 0.0016901652018229167, "learning_rate": 0.0001, "loss": 1.6435, "loss/crossentropy": 2.483980178833008, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.22164048999547958, "step": 2919 }, { "epoch": 0.04360193819574582, "grad_norm": 0.3828125, "grad_norm_var": 0.0016112645467122396, "learning_rate": 0.0001, "loss": 1.4994, "loss/crossentropy": 2.721411943435669, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1829877644777298, "step": 2920 }, { "epoch": 0.043616870366360805, "grad_norm": 0.34375, "grad_norm_var": 0.0018436272939046225, "learning_rate": 0.0001, "loss": 1.5109, "loss/crossentropy": 2.496207356452942, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1945088878273964, "step": 2921 }, { "epoch": 0.04363180253697579, "grad_norm": 0.416015625, "grad_norm_var": 0.0018569310506184897, "learning_rate": 0.0001, "loss": 1.4174, "loss/crossentropy": 2.7425421476364136, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.16742682456970215, "step": 2922 }, { "epoch": 0.04364673470759077, "grad_norm": 0.37109375, "grad_norm_var": 0.0015851338704427084, "learning_rate": 0.0001, "loss": 1.5413, "loss/crossentropy": 2.3732258081436157, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.18973706662654877, "step": 2923 }, { "epoch": 0.04366166687820575, "grad_norm": 0.337890625, "grad_norm_var": 0.001709429423014323, "learning_rate": 0.0001, "loss": 1.4374, "loss/crossentropy": 2.5498515367507935, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.16787800192832947, "step": 2924 }, { "epoch": 0.04367659904882073, "grad_norm": 0.447265625, "grad_norm_var": 0.0015736262003580729, "learning_rate": 0.0001, "loss": 1.6084, "loss/crossentropy": 2.565731406211853, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.198272705078125, "step": 2925 }, { "epoch": 0.04369153121943571, "grad_norm": 0.37109375, "grad_norm_var": 0.0015807469685872396, "learning_rate": 0.0001, "loss": 1.6013, "loss/crossentropy": 2.42183518409729, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.19507751613855362, "step": 2926 }, { "epoch": 0.043706463390050694, "grad_norm": 0.400390625, "grad_norm_var": 0.0015538533528645833, "learning_rate": 0.0001, "loss": 1.5607, "loss/crossentropy": 2.634357213973999, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.1856776475906372, "step": 2927 }, { "epoch": 0.04372139556066568, "grad_norm": 0.4609375, "grad_norm_var": 0.0018676122029622395, "learning_rate": 0.0001, "loss": 1.575, "loss/crossentropy": 2.569041132926941, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.21562807261943817, "step": 2928 }, { "epoch": 0.04373632773128066, "grad_norm": 0.3984375, "grad_norm_var": 0.0018658955891927083, "learning_rate": 0.0001, "loss": 1.7458, "loss/crossentropy": 2.563103437423706, "loss/fcd": 1.50390625, "loss/idx": 9.0, "loss/logits": 0.2419409230351448, "step": 2929 }, { "epoch": 0.04375125990189564, "grad_norm": 0.38671875, "grad_norm_var": 0.001310586929321289, "learning_rate": 0.0001, "loss": 1.5074, "loss/crossentropy": 2.585684895515442, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.1831977739930153, "step": 2930 }, { "epoch": 0.04376619207251062, "grad_norm": 0.51171875, "grad_norm_var": 0.002134307225545247, "learning_rate": 0.0001, "loss": 1.5464, "loss/crossentropy": 2.773849129676819, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.19096774607896805, "step": 2931 }, { "epoch": 0.0437811242431256, "grad_norm": 0.3828125, "grad_norm_var": 0.002025715510050456, "learning_rate": 0.0001, "loss": 1.5388, "loss/crossentropy": 2.3508574962615967, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.17945174127817154, "step": 2932 }, { "epoch": 0.04379605641374058, "grad_norm": 0.380859375, "grad_norm_var": 0.0020388285319010418, "learning_rate": 0.0001, "loss": 1.6858, "loss/crossentropy": 2.607285261154175, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.23661434650421143, "step": 2933 }, { "epoch": 0.043810988584355566, "grad_norm": 0.38671875, "grad_norm_var": 0.0019341627756754556, "learning_rate": 0.0001, "loss": 1.6162, "loss/crossentropy": 2.6566028594970703, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.22556369751691818, "step": 2934 }, { "epoch": 0.04382592075497055, "grad_norm": 0.419921875, "grad_norm_var": 0.0019391377766927083, "learning_rate": 0.0001, "loss": 1.763, "loss/crossentropy": 2.3140666484832764, "loss/fcd": 1.515625, "loss/idx": 9.0, "loss/logits": 0.24739393591880798, "step": 2935 }, { "epoch": 0.043840852925585524, "grad_norm": 0.376953125, "grad_norm_var": 0.00195463498433431, "learning_rate": 0.0001, "loss": 1.5359, "loss/crossentropy": 2.773358464241028, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.20777374505996704, "step": 2936 }, { "epoch": 0.04385578509620051, "grad_norm": 0.392578125, "grad_norm_var": 0.0017404556274414062, "learning_rate": 0.0001, "loss": 1.5246, "loss/crossentropy": 2.627598524093628, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19650404155254364, "step": 2937 }, { "epoch": 0.04387071726681549, "grad_norm": 0.439453125, "grad_norm_var": 0.0018167495727539062, "learning_rate": 0.0001, "loss": 1.5258, "loss/crossentropy": 2.4758599996566772, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.17421112209558487, "step": 2938 }, { "epoch": 0.04388564943743047, "grad_norm": 0.4296875, "grad_norm_var": 0.001773834228515625, "learning_rate": 0.0001, "loss": 1.4092, "loss/crossentropy": 2.610916495323181, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.16696903854608536, "step": 2939 }, { "epoch": 0.043900581608045455, "grad_norm": 0.40625, "grad_norm_var": 0.0014294783274332683, "learning_rate": 0.0001, "loss": 1.5902, "loss/crossentropy": 2.6124664545059204, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.19179312139749527, "step": 2940 }, { "epoch": 0.04391551377866044, "grad_norm": 0.44140625, "grad_norm_var": 0.0014040629069010417, "learning_rate": 0.0001, "loss": 1.566, "loss/crossentropy": 2.659586191177368, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20665714144706726, "step": 2941 }, { "epoch": 0.043930445949275414, "grad_norm": 0.46875, "grad_norm_var": 0.0014724095662434896, "learning_rate": 0.0001, "loss": 1.7292, "loss/crossentropy": 2.4600117206573486, "loss/fcd": 1.5, "loss/idx": 9.0, "loss/logits": 0.2292054072022438, "step": 2942 }, { "epoch": 0.043945378119890396, "grad_norm": 0.515625, "grad_norm_var": 0.002036015192667643, "learning_rate": 0.0001, "loss": 1.5914, "loss/crossentropy": 2.762745976448059, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.2007603719830513, "step": 2943 }, { "epoch": 0.04396031029050538, "grad_norm": 0.48046875, "grad_norm_var": 0.002153635025024414, "learning_rate": 0.0001, "loss": 1.4913, "loss/crossentropy": 2.632846474647522, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.18266583234071732, "step": 2944 }, { "epoch": 0.04397524246112036, "grad_norm": 0.47265625, "grad_norm_var": 0.0022236982981363933, "learning_rate": 0.0001, "loss": 1.5884, "loss/crossentropy": 2.3143118619918823, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.1782917156815529, "step": 2945 }, { "epoch": 0.043990174631735345, "grad_norm": 0.474609375, "grad_norm_var": 0.002190081278483073, "learning_rate": 0.0001, "loss": 1.6665, "loss/crossentropy": 2.464508056640625, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.20164836198091507, "step": 2946 }, { "epoch": 0.04400510680235032, "grad_norm": 0.53125, "grad_norm_var": 0.0024103800455729167, "learning_rate": 0.0001, "loss": 1.6785, "loss/crossentropy": 2.742960810661316, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.2253502532839775, "step": 2947 }, { "epoch": 0.0440200389729653, "grad_norm": 0.38671875, "grad_norm_var": 0.002382850646972656, "learning_rate": 0.0001, "loss": 1.5838, "loss/crossentropy": 2.5539259910583496, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.19314886629581451, "step": 2948 }, { "epoch": 0.044034971143580286, "grad_norm": 0.37109375, "grad_norm_var": 0.002462879816691081, "learning_rate": 0.0001, "loss": 1.6126, "loss/crossentropy": 2.471666097640991, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.20241039246320724, "step": 2949 }, { "epoch": 0.04404990331419527, "grad_norm": 0.435546875, "grad_norm_var": 0.002283668518066406, "learning_rate": 0.0001, "loss": 1.5151, "loss/crossentropy": 2.6657344102859497, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.2026199847459793, "step": 2950 }, { "epoch": 0.04406483548481025, "grad_norm": 0.375, "grad_norm_var": 0.0025311628977457683, "learning_rate": 0.0001, "loss": 1.5231, "loss/crossentropy": 2.432537794113159, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19103725254535675, "step": 2951 }, { "epoch": 0.044079767655425234, "grad_norm": 0.5078125, "grad_norm_var": 0.002547136942545573, "learning_rate": 0.0001, "loss": 1.806, "loss/crossentropy": 2.6769468784332275, "loss/fcd": 1.5625, "loss/idx": 9.0, "loss/logits": 0.24351592361927032, "step": 2952 }, { "epoch": 0.04409469982604021, "grad_norm": 0.38671875, "grad_norm_var": 0.0025906721750895184, "learning_rate": 0.0001, "loss": 1.3555, "loss/crossentropy": 2.3951520919799805, "loss/fcd": 1.19140625, "loss/idx": 9.0, "loss/logits": 0.16409879177808762, "step": 2953 }, { "epoch": 0.04410963199665519, "grad_norm": 0.4140625, "grad_norm_var": 0.0026503880818684895, "learning_rate": 0.0001, "loss": 1.5803, "loss/crossentropy": 2.7692240476608276, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.19751302897930145, "step": 2954 }, { "epoch": 0.044124564167270175, "grad_norm": 0.40234375, "grad_norm_var": 0.0027478535970052085, "learning_rate": 0.0001, "loss": 1.5206, "loss/crossentropy": 2.4797921180725098, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18851931393146515, "step": 2955 }, { "epoch": 0.04413949633788516, "grad_norm": 0.3671875, "grad_norm_var": 0.00302886962890625, "learning_rate": 0.0001, "loss": 1.4355, "loss/crossentropy": 2.5406899452209473, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.1737699657678604, "step": 2956 }, { "epoch": 0.04415442850850014, "grad_norm": 0.42578125, "grad_norm_var": 0.003040059407552083, "learning_rate": 0.0001, "loss": 1.6259, "loss/crossentropy": 2.5826886892318726, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.2040177583694458, "step": 2957 }, { "epoch": 0.04416936067911512, "grad_norm": 0.451171875, "grad_norm_var": 0.002988417943318685, "learning_rate": 0.0001, "loss": 1.5169, "loss/crossentropy": 2.8705650568008423, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.1887732893228531, "step": 2958 }, { "epoch": 0.0441842928497301, "grad_norm": 0.3984375, "grad_norm_var": 0.0026241143544514975, "learning_rate": 0.0001, "loss": 1.5813, "loss/crossentropy": 2.642223596572876, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.21413271129131317, "step": 2959 }, { "epoch": 0.04419922502034508, "grad_norm": 0.54296875, "grad_norm_var": 0.003288380304972331, "learning_rate": 0.0001, "loss": 1.8229, "loss/crossentropy": 2.267346501350403, "loss/fcd": 1.58203125, "loss/idx": 9.0, "loss/logits": 0.24089457094669342, "step": 2960 }, { "epoch": 0.044214157190960064, "grad_norm": 0.361328125, "grad_norm_var": 0.003488604227701823, "learning_rate": 0.0001, "loss": 1.392, "loss/crossentropy": 2.4762603044509888, "loss/fcd": 1.2265625, "loss/idx": 9.0, "loss/logits": 0.16547292470932007, "step": 2961 }, { "epoch": 0.04422908936157505, "grad_norm": 0.412109375, "grad_norm_var": 0.003336016337076823, "learning_rate": 0.0001, "loss": 1.4915, "loss/crossentropy": 2.56834077835083, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.1711532026529312, "step": 2962 }, { "epoch": 0.04424402153219003, "grad_norm": 0.359375, "grad_norm_var": 0.0027037938435872395, "learning_rate": 0.0001, "loss": 1.4766, "loss/crossentropy": 2.312143087387085, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.17584850639104843, "step": 2963 }, { "epoch": 0.044258953702805005, "grad_norm": 0.33984375, "grad_norm_var": 0.0030013402303059895, "learning_rate": 0.0001, "loss": 1.5844, "loss/crossentropy": 2.498114824295044, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.205511212348938, "step": 2964 }, { "epoch": 0.04427388587341999, "grad_norm": 0.416015625, "grad_norm_var": 0.0028978824615478516, "learning_rate": 0.0001, "loss": 1.5544, "loss/crossentropy": 2.7555795907974243, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.21063003689050674, "step": 2965 }, { "epoch": 0.04428881804403497, "grad_norm": 0.451171875, "grad_norm_var": 0.00296171506245931, "learning_rate": 0.0001, "loss": 1.5148, "loss/crossentropy": 2.6969597339630127, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.19449160993099213, "step": 2966 }, { "epoch": 0.04430375021464995, "grad_norm": 0.384765625, "grad_norm_var": 0.0029179255167643228, "learning_rate": 0.0001, "loss": 1.7047, "loss/crossentropy": 2.741969585418701, "loss/fcd": 1.45703125, "loss/idx": 9.0, "loss/logits": 0.24767477810382843, "step": 2967 }, { "epoch": 0.044318682385264936, "grad_norm": 0.490234375, "grad_norm_var": 0.0027169386545817056, "learning_rate": 0.0001, "loss": 1.5414, "loss/crossentropy": 2.890550136566162, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.18978962302207947, "step": 2968 }, { "epoch": 0.04433361455587992, "grad_norm": 0.466796875, "grad_norm_var": 0.002840105692545573, "learning_rate": 0.0001, "loss": 2.0658, "loss/crossentropy": 2.4640356302261353, "loss/fcd": 1.72265625, "loss/idx": 9.0, "loss/logits": 0.343137726187706, "step": 2969 }, { "epoch": 0.044348546726494895, "grad_norm": 0.37890625, "grad_norm_var": 0.0029345194498697916, "learning_rate": 0.0001, "loss": 1.5191, "loss/crossentropy": 2.6923552751541138, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.1909312978386879, "step": 2970 }, { "epoch": 0.04436347889710988, "grad_norm": 0.41796875, "grad_norm_var": 0.002922312418619792, "learning_rate": 0.0001, "loss": 1.5844, "loss/crossentropy": 2.258636236190796, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.1898234337568283, "step": 2971 }, { "epoch": 0.04437841106772486, "grad_norm": 0.3671875, "grad_norm_var": 0.002922312418619792, "learning_rate": 0.0001, "loss": 1.6814, "loss/crossentropy": 2.6858255863189697, "loss/fcd": 1.4453125, "loss/idx": 9.0, "loss/logits": 0.23612558841705322, "step": 2972 }, { "epoch": 0.04439334323833984, "grad_norm": 0.392578125, "grad_norm_var": 0.002950143814086914, "learning_rate": 0.0001, "loss": 1.5302, "loss/crossentropy": 2.5708413124084473, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.18643417954444885, "step": 2973 }, { "epoch": 0.044408275408954825, "grad_norm": 0.703125, "grad_norm_var": 0.008152008056640625, "learning_rate": 0.0001, "loss": 1.5952, "loss/crossentropy": 2.5309531688690186, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.20063798129558563, "step": 2974 }, { "epoch": 0.0444232075795698, "grad_norm": 0.41796875, "grad_norm_var": 0.00809319814046224, "learning_rate": 0.0001, "loss": 1.6402, "loss/crossentropy": 2.630118489265442, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.2182958871126175, "step": 2975 }, { "epoch": 0.044438139750184784, "grad_norm": 0.353515625, "grad_norm_var": 0.007518116633097331, "learning_rate": 0.0001, "loss": 1.5898, "loss/crossentropy": 2.507883310317993, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.2148245945572853, "step": 2976 }, { "epoch": 0.04445307192079977, "grad_norm": 0.435546875, "grad_norm_var": 0.007286183039347331, "learning_rate": 0.0001, "loss": 1.5392, "loss/crossentropy": 2.663014054298401, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.2189333587884903, "step": 2977 }, { "epoch": 0.04446800409141475, "grad_norm": 0.4921875, "grad_norm_var": 0.007557932535807292, "learning_rate": 0.0001, "loss": 1.5156, "loss/crossentropy": 2.6023967266082764, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18352258205413818, "step": 2978 }, { "epoch": 0.04448293626202973, "grad_norm": 0.421875, "grad_norm_var": 0.007220204671223958, "learning_rate": 0.0001, "loss": 1.6758, "loss/crossentropy": 2.5271655321121216, "loss/fcd": 1.4453125, "loss/idx": 9.0, "loss/logits": 0.23050003498792648, "step": 2979 }, { "epoch": 0.044497868432644715, "grad_norm": 0.36328125, "grad_norm_var": 0.006963094075520833, "learning_rate": 0.0001, "loss": 1.5433, "loss/crossentropy": 2.4342827796936035, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.20343679189682007, "step": 2980 }, { "epoch": 0.04451280060325969, "grad_norm": 0.384765625, "grad_norm_var": 0.0071014404296875, "learning_rate": 0.0001, "loss": 1.6415, "loss/crossentropy": 2.782238721847534, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.21958937495946884, "step": 2981 }, { "epoch": 0.04452773277387467, "grad_norm": 0.431640625, "grad_norm_var": 0.00707696278889974, "learning_rate": 0.0001, "loss": 1.662, "loss/crossentropy": 2.5226932764053345, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.2401016429066658, "step": 2982 }, { "epoch": 0.044542664944489656, "grad_norm": 0.41796875, "grad_norm_var": 0.006939427057902018, "learning_rate": 0.0001, "loss": 1.6105, "loss/crossentropy": 2.4377386569976807, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.2159440815448761, "step": 2983 }, { "epoch": 0.04455759711510464, "grad_norm": 0.42578125, "grad_norm_var": 0.006711260477701823, "learning_rate": 0.0001, "loss": 1.6313, "loss/crossentropy": 2.6672240495681763, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.20941061526536942, "step": 2984 }, { "epoch": 0.04457252928571962, "grad_norm": 0.349609375, "grad_norm_var": 0.006985918680826823, "learning_rate": 0.0001, "loss": 1.5048, "loss/crossentropy": 2.5309709310531616, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19227334856987, "step": 2985 }, { "epoch": 0.0445874614563346, "grad_norm": 0.3671875, "grad_norm_var": 0.007062021891276042, "learning_rate": 0.0001, "loss": 1.4369, "loss/crossentropy": 2.9034671783447266, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.16735634207725525, "step": 2986 }, { "epoch": 0.04460239362694958, "grad_norm": 0.51953125, "grad_norm_var": 0.007660420735677084, "learning_rate": 0.0001, "loss": 1.8301, "loss/crossentropy": 2.4699528217315674, "loss/fcd": 1.58984375, "loss/idx": 9.0, "loss/logits": 0.24030498415231705, "step": 2987 }, { "epoch": 0.04461732579756456, "grad_norm": 0.40234375, "grad_norm_var": 0.00745385487874349, "learning_rate": 0.0001, "loss": 1.6516, "loss/crossentropy": 2.6117340326309204, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.22187793254852295, "step": 2988 }, { "epoch": 0.044632257968179545, "grad_norm": 0.45703125, "grad_norm_var": 0.007392485936482747, "learning_rate": 0.0001, "loss": 1.5258, "loss/crossentropy": 2.648404359817505, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19372648000717163, "step": 2989 }, { "epoch": 0.04464719013879453, "grad_norm": 0.3515625, "grad_norm_var": 0.002500136693318685, "learning_rate": 0.0001, "loss": 1.5206, "loss/crossentropy": 2.4787005186080933, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.1807347983121872, "step": 2990 }, { "epoch": 0.04466212230940951, "grad_norm": 0.376953125, "grad_norm_var": 0.0025725682576497396, "learning_rate": 0.0001, "loss": 1.4787, "loss/crossentropy": 2.524003028869629, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18186602741479874, "step": 2991 }, { "epoch": 0.044677054480024486, "grad_norm": 0.421875, "grad_norm_var": 0.002355051040649414, "learning_rate": 0.0001, "loss": 1.6976, "loss/crossentropy": 2.4494649171829224, "loss/fcd": 1.45703125, "loss/idx": 9.0, "loss/logits": 0.24058599025011063, "step": 2992 }, { "epoch": 0.04469198665063947, "grad_norm": 0.4140625, "grad_norm_var": 0.002321306864420573, "learning_rate": 0.0001, "loss": 1.5928, "loss/crossentropy": 2.7234619855880737, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.1982303187251091, "step": 2993 }, { "epoch": 0.04470691882125445, "grad_norm": 0.412109375, "grad_norm_var": 0.0018696943918863932, "learning_rate": 0.0001, "loss": 1.632, "loss/crossentropy": 2.7120161056518555, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.22570385038852692, "step": 2994 }, { "epoch": 0.044721850991869434, "grad_norm": 0.52734375, "grad_norm_var": 0.0027692000071207683, "learning_rate": 0.0001, "loss": 1.618, "loss/crossentropy": 2.6925458908081055, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.20391300320625305, "step": 2995 }, { "epoch": 0.04473678316248442, "grad_norm": 0.3984375, "grad_norm_var": 0.0026089827219645183, "learning_rate": 0.0001, "loss": 1.474, "loss/crossentropy": 2.573264479637146, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18100111186504364, "step": 2996 }, { "epoch": 0.04475171533309939, "grad_norm": 0.4140625, "grad_norm_var": 0.0025400797526041667, "learning_rate": 0.0001, "loss": 1.6094, "loss/crossentropy": 2.6843268871307373, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.21489544212818146, "step": 2997 }, { "epoch": 0.044766647503714375, "grad_norm": 0.353515625, "grad_norm_var": 0.0027791341145833332, "learning_rate": 0.0001, "loss": 1.5296, "loss/crossentropy": 2.802879214286804, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.18975920975208282, "step": 2998 }, { "epoch": 0.04478157967432936, "grad_norm": 0.447265625, "grad_norm_var": 0.0028518517812093098, "learning_rate": 0.0001, "loss": 1.6435, "loss/crossentropy": 2.4921804666519165, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.21771711856126785, "step": 2999 }, { "epoch": 0.04479651184494434, "grad_norm": 0.42578125, "grad_norm_var": 0.0028518517812093098, "learning_rate": 0.0001, "loss": 1.6329, "loss/crossentropy": 2.8012707233428955, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.21885357797145844, "step": 3000 }, { "epoch": 0.044811444015559324, "grad_norm": 0.3671875, "grad_norm_var": 0.0027180989583333332, "learning_rate": 0.0001, "loss": 1.4705, "loss/crossentropy": 2.6076290607452393, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18141985684633255, "step": 3001 }, { "epoch": 0.044826376186174306, "grad_norm": 0.40625, "grad_norm_var": 0.0025591532389322917, "learning_rate": 0.0001, "loss": 1.483, "loss/crossentropy": 2.6484668254852295, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.17444197088479996, "step": 3002 }, { "epoch": 0.04484130835678928, "grad_norm": 0.443359375, "grad_norm_var": 0.0018952528635660807, "learning_rate": 0.0001, "loss": 1.6965, "loss/crossentropy": 2.611799478530884, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.23164018988609314, "step": 3003 }, { "epoch": 0.044856240527404265, "grad_norm": 0.384765625, "grad_norm_var": 0.0019411722819010417, "learning_rate": 0.0001, "loss": 1.5712, "loss/crossentropy": 2.5289047956466675, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.2039918154478073, "step": 3004 }, { "epoch": 0.04487117269801925, "grad_norm": 0.349609375, "grad_norm_var": 0.0020259698232014973, "learning_rate": 0.0001, "loss": 1.5076, "loss/crossentropy": 2.5928255319595337, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.2028687596321106, "step": 3005 }, { "epoch": 0.04488610486863423, "grad_norm": 0.380859375, "grad_norm_var": 0.0018674214680989584, "learning_rate": 0.0001, "loss": 1.4884, "loss/crossentropy": 2.8270065784454346, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.17585495859384537, "step": 3006 }, { "epoch": 0.04490103703924921, "grad_norm": 0.361328125, "grad_norm_var": 0.0019467671712239583, "learning_rate": 0.0001, "loss": 1.5255, "loss/crossentropy": 2.653829336166382, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.18954303860664368, "step": 3007 }, { "epoch": 0.04491596920986419, "grad_norm": 0.361328125, "grad_norm_var": 0.002053689956665039, "learning_rate": 0.0001, "loss": 1.5771, "loss/crossentropy": 2.5563769340515137, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.20597627013921738, "step": 3008 }, { "epoch": 0.04493090138047917, "grad_norm": 0.3984375, "grad_norm_var": 0.0020458062489827475, "learning_rate": 0.0001, "loss": 1.4865, "loss/crossentropy": 2.5579047203063965, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.1779017224907875, "step": 3009 }, { "epoch": 0.044945833551094154, "grad_norm": 0.44921875, "grad_norm_var": 0.0021820068359375, "learning_rate": 0.0001, "loss": 1.6321, "loss/crossentropy": 2.638156771659851, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.21017563343048096, "step": 3010 }, { "epoch": 0.04496076572170914, "grad_norm": 0.421875, "grad_norm_var": 0.0011468887329101562, "learning_rate": 0.0001, "loss": 1.5726, "loss/crossentropy": 2.529091000556946, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.1898174211382866, "step": 3011 }, { "epoch": 0.04497569789232412, "grad_norm": 0.44921875, "grad_norm_var": 0.001313018798828125, "learning_rate": 0.0001, "loss": 1.5679, "loss/crossentropy": 2.5455775260925293, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.20462322235107422, "step": 3012 }, { "epoch": 0.0449906300629391, "grad_norm": 0.361328125, "grad_norm_var": 0.0013941287994384765, "learning_rate": 0.0001, "loss": 1.4835, "loss/crossentropy": 2.4638450145721436, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.18275205790996552, "step": 3013 }, { "epoch": 0.04500556223355408, "grad_norm": 0.392578125, "grad_norm_var": 0.0012599786122639975, "learning_rate": 0.0001, "loss": 1.5416, "loss/crossentropy": 2.5285617113113403, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.18612495809793472, "step": 3014 }, { "epoch": 0.04502049440416906, "grad_norm": 0.41015625, "grad_norm_var": 0.0011123021443684895, "learning_rate": 0.0001, "loss": 1.7839, "loss/crossentropy": 2.5411795377731323, "loss/fcd": 1.50390625, "loss/idx": 9.0, "loss/logits": 0.27995698153972626, "step": 3015 }, { "epoch": 0.04503542657478404, "grad_norm": 0.40625, "grad_norm_var": 0.0010630289713541667, "learning_rate": 0.0001, "loss": 1.4486, "loss/crossentropy": 2.8266077041625977, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17512869834899902, "step": 3016 }, { "epoch": 0.045050358745399026, "grad_norm": 0.4140625, "grad_norm_var": 0.0010172526041666667, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.5405839681625366, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.1741039976477623, "step": 3017 }, { "epoch": 0.04506529091601401, "grad_norm": 0.41015625, "grad_norm_var": 0.0010217666625976563, "learning_rate": 0.0001, "loss": 1.5621, "loss/crossentropy": 2.638413190841675, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.18711459636688232, "step": 3018 }, { "epoch": 0.045080223086628984, "grad_norm": 0.42578125, "grad_norm_var": 0.0009386539459228516, "learning_rate": 0.0001, "loss": 1.574, "loss/crossentropy": 2.7031253576278687, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20678063482046127, "step": 3019 }, { "epoch": 0.04509515525724397, "grad_norm": 0.373046875, "grad_norm_var": 0.0009687900543212891, "learning_rate": 0.0001, "loss": 1.5765, "loss/crossentropy": 2.6044106483459473, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20928020775318146, "step": 3020 }, { "epoch": 0.04511008742785895, "grad_norm": 0.462890625, "grad_norm_var": 0.0010425408681233725, "learning_rate": 0.0001, "loss": 1.7139, "loss/crossentropy": 2.528980016708374, "loss/fcd": 1.484375, "loss/idx": 9.0, "loss/logits": 0.22956640273332596, "step": 3021 }, { "epoch": 0.04512501959847393, "grad_norm": 0.42578125, "grad_norm_var": 0.001024627685546875, "learning_rate": 0.0001, "loss": 1.7246, "loss/crossentropy": 2.5235766172409058, "loss/fcd": 1.5, "loss/idx": 9.0, "loss/logits": 0.22458337992429733, "step": 3022 }, { "epoch": 0.045139951769088915, "grad_norm": 0.4296875, "grad_norm_var": 0.0008938948313395183, "learning_rate": 0.0001, "loss": 1.7694, "loss/crossentropy": 2.6800838708877563, "loss/fcd": 1.5, "loss/idx": 9.0, "loss/logits": 0.269368477165699, "step": 3023 }, { "epoch": 0.0451548839397039, "grad_norm": 0.35546875, "grad_norm_var": 0.0009356180826822917, "learning_rate": 0.0001, "loss": 1.5472, "loss/crossentropy": 2.550456166267395, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19562239944934845, "step": 3024 }, { "epoch": 0.045169816110318874, "grad_norm": 0.474609375, "grad_norm_var": 0.0011643568674723308, "learning_rate": 0.0001, "loss": 1.7529, "loss/crossentropy": 2.514010429382324, "loss/fcd": 1.53125, "loss/idx": 9.0, "loss/logits": 0.2216210961341858, "step": 3025 }, { "epoch": 0.045184748280933856, "grad_norm": 0.365234375, "grad_norm_var": 0.00123748779296875, "learning_rate": 0.0001, "loss": 1.4581, "loss/crossentropy": 2.4983917474746704, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.16899039596319199, "step": 3026 }, { "epoch": 0.04519968045154884, "grad_norm": 0.369140625, "grad_norm_var": 0.0013357639312744141, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.480937361717224, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.17107923328876495, "step": 3027 }, { "epoch": 0.04521461262216382, "grad_norm": 0.44921875, "grad_norm_var": 0.0013357639312744141, "learning_rate": 0.0001, "loss": 1.653, "loss/crossentropy": 2.3843698501586914, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.20379211753606796, "step": 3028 }, { "epoch": 0.045229544792778804, "grad_norm": 0.373046875, "grad_norm_var": 0.001271677017211914, "learning_rate": 0.0001, "loss": 1.4778, "loss/crossentropy": 2.7110718488693237, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.180960975587368, "step": 3029 }, { "epoch": 0.04524447696339379, "grad_norm": 0.3984375, "grad_norm_var": 0.0012613296508789062, "learning_rate": 0.0001, "loss": 1.4655, "loss/crossentropy": 2.7378227710723877, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.18422307074069977, "step": 3030 }, { "epoch": 0.04525940913400876, "grad_norm": 0.3515625, "grad_norm_var": 0.00146636962890625, "learning_rate": 0.0001, "loss": 1.3905, "loss/crossentropy": 2.7349284887313843, "loss/fcd": 1.2265625, "loss/idx": 9.0, "loss/logits": 0.16390231996774673, "step": 3031 }, { "epoch": 0.045274341304623746, "grad_norm": 0.421875, "grad_norm_var": 0.0014836629231770833, "learning_rate": 0.0001, "loss": 1.6723, "loss/crossentropy": 2.522131562232971, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.2231122851371765, "step": 3032 }, { "epoch": 0.04528927347523873, "grad_norm": 0.4375, "grad_norm_var": 0.0015424092610677083, "learning_rate": 0.0001, "loss": 1.6585, "loss/crossentropy": 2.5267066955566406, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.22492055594921112, "step": 3033 }, { "epoch": 0.04530420564585371, "grad_norm": 0.419921875, "grad_norm_var": 0.0015515486399332683, "learning_rate": 0.0001, "loss": 1.5634, "loss/crossentropy": 2.8208200931549072, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.20791849493980408, "step": 3034 }, { "epoch": 0.045319137816468694, "grad_norm": 0.380859375, "grad_norm_var": 0.0015731175740559896, "learning_rate": 0.0001, "loss": 1.5863, "loss/crossentropy": 2.5473004579544067, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.2035195305943489, "step": 3035 }, { "epoch": 0.04533406998708367, "grad_norm": 0.416015625, "grad_norm_var": 0.0015024820963541667, "learning_rate": 0.0001, "loss": 1.4598, "loss/crossentropy": 2.7393736839294434, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.1863822415471077, "step": 3036 }, { "epoch": 0.04534900215769865, "grad_norm": 0.404296875, "grad_norm_var": 0.001289812723795573, "learning_rate": 0.0001, "loss": 1.5702, "loss/crossentropy": 2.4712942838668823, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.20690548419952393, "step": 3037 }, { "epoch": 0.045363934328313635, "grad_norm": 0.435546875, "grad_norm_var": 0.0013234297434488933, "learning_rate": 0.0001, "loss": 1.7387, "loss/crossentropy": 2.394981265068054, "loss/fcd": 1.515625, "loss/idx": 9.0, "loss/logits": 0.22304615378379822, "step": 3038 }, { "epoch": 0.04537886649892862, "grad_norm": 0.41015625, "grad_norm_var": 0.0012833754221598308, "learning_rate": 0.0001, "loss": 1.5057, "loss/crossentropy": 2.5672391653060913, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.1853676736354828, "step": 3039 }, { "epoch": 0.0453937986695436, "grad_norm": 0.376953125, "grad_norm_var": 0.00117340087890625, "learning_rate": 0.0001, "loss": 1.3722, "loss/crossentropy": 2.679948925971985, "loss/fcd": 1.21875, "loss/idx": 9.0, "loss/logits": 0.15346281975507736, "step": 3040 }, { "epoch": 0.04540873084015858, "grad_norm": 0.38671875, "grad_norm_var": 0.000843667984008789, "learning_rate": 0.0001, "loss": 1.4232, "loss/crossentropy": 2.6324076652526855, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.17320851236581802, "step": 3041 }, { "epoch": 0.04542366301077356, "grad_norm": 0.388671875, "grad_norm_var": 0.0007700443267822266, "learning_rate": 0.0001, "loss": 1.4981, "loss/crossentropy": 2.5263367891311646, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1816965565085411, "step": 3042 }, { "epoch": 0.04543859518138854, "grad_norm": 0.431640625, "grad_norm_var": 0.0007466475168863932, "learning_rate": 0.0001, "loss": 1.5052, "loss/crossentropy": 2.5175379514694214, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.17319486290216446, "step": 3043 }, { "epoch": 0.045453527352003524, "grad_norm": 0.435546875, "grad_norm_var": 0.0006779988606770833, "learning_rate": 0.0001, "loss": 1.6697, "loss/crossentropy": 2.683398485183716, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.22829460352659225, "step": 3044 }, { "epoch": 0.04546845952261851, "grad_norm": 0.404296875, "grad_norm_var": 0.00060882568359375, "learning_rate": 0.0001, "loss": 1.6782, "loss/crossentropy": 2.6843109130859375, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.22896407544612885, "step": 3045 }, { "epoch": 0.04548339169323349, "grad_norm": 0.466796875, "grad_norm_var": 0.0008296807607014974, "learning_rate": 0.0001, "loss": 1.6085, "loss/crossentropy": 2.9736781120300293, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.22177670896053314, "step": 3046 }, { "epoch": 0.045498323863848465, "grad_norm": 0.6796875, "grad_norm_var": 0.0049793084462483725, "learning_rate": 0.0001, "loss": 1.6438, "loss/crossentropy": 2.6579824686050415, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.19063197076320648, "step": 3047 }, { "epoch": 0.04551325603446345, "grad_norm": 0.376953125, "grad_norm_var": 0.005160268147786458, "learning_rate": 0.0001, "loss": 1.573, "loss/crossentropy": 2.4792014360427856, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.2057751938700676, "step": 3048 }, { "epoch": 0.04552818820507843, "grad_norm": 0.42578125, "grad_norm_var": 0.00515435536702474, "learning_rate": 0.0001, "loss": 1.518, "loss/crossentropy": 2.4952834844589233, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18592575937509537, "step": 3049 }, { "epoch": 0.04554312037569341, "grad_norm": 0.408203125, "grad_norm_var": 0.005174763997395833, "learning_rate": 0.0001, "loss": 1.6674, "loss/crossentropy": 2.902143359184265, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.22599975764751434, "step": 3050 }, { "epoch": 0.045558052546308396, "grad_norm": 0.423828125, "grad_norm_var": 0.005027198791503906, "learning_rate": 0.0001, "loss": 1.4344, "loss/crossentropy": 2.687128782272339, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.16876815259456635, "step": 3051 }, { "epoch": 0.04557298471692338, "grad_norm": 0.365234375, "grad_norm_var": 0.0052792867024739586, "learning_rate": 0.0001, "loss": 1.579, "loss/crossentropy": 2.5076926946640015, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.19619326293468475, "step": 3052 }, { "epoch": 0.045587916887538354, "grad_norm": 0.45703125, "grad_norm_var": 0.005298598607381185, "learning_rate": 0.0001, "loss": 1.7035, "loss/crossentropy": 2.7137664556503296, "loss/fcd": 1.47265625, "loss/idx": 9.0, "loss/logits": 0.23079677671194077, "step": 3053 }, { "epoch": 0.04560284905815334, "grad_norm": 0.34765625, "grad_norm_var": 0.00571130116780599, "learning_rate": 0.0001, "loss": 1.4529, "loss/crossentropy": 2.5252299308776855, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.179414264857769, "step": 3054 }, { "epoch": 0.04561778122876832, "grad_norm": 0.396484375, "grad_norm_var": 0.005748351414998372, "learning_rate": 0.0001, "loss": 1.353, "loss/crossentropy": 2.543508529663086, "loss/fcd": 1.19921875, "loss/idx": 9.0, "loss/logits": 0.15379596501588821, "step": 3055 }, { "epoch": 0.0456327133993833, "grad_norm": 0.380859375, "grad_norm_var": 0.005725208918253581, "learning_rate": 0.0001, "loss": 1.406, "loss/crossentropy": 2.7750244140625, "loss/fcd": 1.23828125, "loss/idx": 9.0, "loss/logits": 0.16775666922330856, "step": 3056 }, { "epoch": 0.045647645569998285, "grad_norm": 0.39453125, "grad_norm_var": 0.005690749486287435, "learning_rate": 0.0001, "loss": 1.4787, "loss/crossentropy": 2.7191981077194214, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.1896023005247116, "step": 3057 }, { "epoch": 0.04566257774061326, "grad_norm": 0.396484375, "grad_norm_var": 0.005657815933227539, "learning_rate": 0.0001, "loss": 1.6016, "loss/crossentropy": 2.432904601097107, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.1992526724934578, "step": 3058 }, { "epoch": 0.045677509911228244, "grad_norm": 0.3671875, "grad_norm_var": 0.005855560302734375, "learning_rate": 0.0001, "loss": 1.5131, "loss/crossentropy": 2.575082302093506, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.18884506821632385, "step": 3059 }, { "epoch": 0.045692442081843226, "grad_norm": 0.40625, "grad_norm_var": 0.005850076675415039, "learning_rate": 0.0001, "loss": 1.4673, "loss/crossentropy": 2.522244930267334, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.1743082031607628, "step": 3060 }, { "epoch": 0.04570737425245821, "grad_norm": 0.357421875, "grad_norm_var": 0.006076669692993164, "learning_rate": 0.0001, "loss": 1.6317, "loss/crossentropy": 2.359546184539795, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.19814765453338623, "step": 3061 }, { "epoch": 0.04572230642307319, "grad_norm": 0.35546875, "grad_norm_var": 0.006092071533203125, "learning_rate": 0.0001, "loss": 1.4041, "loss/crossentropy": 2.6154147386550903, "loss/fcd": 1.2265625, "loss/idx": 9.0, "loss/logits": 0.17758549004793167, "step": 3062 }, { "epoch": 0.045737238593688175, "grad_norm": 0.42578125, "grad_norm_var": 0.0009469985961914062, "learning_rate": 0.0001, "loss": 1.6156, "loss/crossentropy": 2.6313316822052, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.20152980089187622, "step": 3063 }, { "epoch": 0.04575217076430315, "grad_norm": 0.41015625, "grad_norm_var": 0.0009456475575764974, "learning_rate": 0.0001, "loss": 1.507, "loss/crossentropy": 2.619558334350586, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.1827666312456131, "step": 3064 }, { "epoch": 0.04576710293491813, "grad_norm": 0.35546875, "grad_norm_var": 0.0009651025136311849, "learning_rate": 0.0001, "loss": 1.4699, "loss/crossentropy": 2.3638755083084106, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.1808125004172325, "step": 3065 }, { "epoch": 0.045782035105533116, "grad_norm": 0.408203125, "grad_norm_var": 0.0009651025136311849, "learning_rate": 0.0001, "loss": 1.664, "loss/crossentropy": 2.6187846660614014, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.2577797695994377, "step": 3066 }, { "epoch": 0.0457969672761481, "grad_norm": 0.421875, "grad_norm_var": 0.0009566624959309896, "learning_rate": 0.0001, "loss": 1.8222, "loss/crossentropy": 2.3348960876464844, "loss/fcd": 1.5546875, "loss/idx": 9.0, "loss/logits": 0.2675005793571472, "step": 3067 }, { "epoch": 0.04581189944676308, "grad_norm": 0.404296875, "grad_norm_var": 0.0009210586547851562, "learning_rate": 0.0001, "loss": 1.535, "loss/crossentropy": 2.4353928565979004, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.18737050145864487, "step": 3068 }, { "epoch": 0.04582683161737806, "grad_norm": 0.359375, "grad_norm_var": 0.0006810506184895833, "learning_rate": 0.0001, "loss": 1.3855, "loss/crossentropy": 2.5012736320495605, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.15112649649381638, "step": 3069 }, { "epoch": 0.04584176378799304, "grad_norm": 0.361328125, "grad_norm_var": 0.0006215254465738933, "learning_rate": 0.0001, "loss": 1.5306, "loss/crossentropy": 2.4983397722244263, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.190759114921093, "step": 3070 }, { "epoch": 0.04585669595860802, "grad_norm": 0.392578125, "grad_norm_var": 0.0006178379058837891, "learning_rate": 0.0001, "loss": 1.5743, "loss/crossentropy": 2.6544697284698486, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20709017664194107, "step": 3071 }, { "epoch": 0.045871628129223005, "grad_norm": 0.412109375, "grad_norm_var": 0.0006519158681233724, "learning_rate": 0.0001, "loss": 1.5603, "loss/crossentropy": 2.6285094022750854, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.21650660037994385, "step": 3072 }, { "epoch": 0.04588656029983799, "grad_norm": 0.33984375, "grad_norm_var": 0.0008005619049072266, "learning_rate": 0.0001, "loss": 1.5398, "loss/crossentropy": 2.4827815294265747, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.20390672981739044, "step": 3073 }, { "epoch": 0.04590149247045297, "grad_norm": 0.3828125, "grad_norm_var": 0.0007928848266601562, "learning_rate": 0.0001, "loss": 1.583, "loss/crossentropy": 2.5244340896606445, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.204135000705719, "step": 3074 }, { "epoch": 0.045916424641067946, "grad_norm": 0.40625, "grad_norm_var": 0.000795427958170573, "learning_rate": 0.0001, "loss": 1.6046, "loss/crossentropy": 2.8219584226608276, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.22568777948617935, "step": 3075 }, { "epoch": 0.04593135681168293, "grad_norm": 0.341796875, "grad_norm_var": 0.0008935133616129557, "learning_rate": 0.0001, "loss": 1.3703, "loss/crossentropy": 2.566993474960327, "loss/fcd": 1.20703125, "loss/idx": 9.0, "loss/logits": 0.16324107348918915, "step": 3076 }, { "epoch": 0.04594628898229791, "grad_norm": 0.400390625, "grad_norm_var": 0.0008599440256754558, "learning_rate": 0.0001, "loss": 1.4627, "loss/crossentropy": 2.7115864753723145, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.17756866663694382, "step": 3077 }, { "epoch": 0.045961221152912894, "grad_norm": 0.42578125, "grad_norm_var": 0.0008816878000895182, "learning_rate": 0.0001, "loss": 1.6148, "loss/crossentropy": 2.6270207166671753, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.22029897570610046, "step": 3078 }, { "epoch": 0.04597615332352788, "grad_norm": 0.412109375, "grad_norm_var": 0.0008290608723958334, "learning_rate": 0.0001, "loss": 1.4424, "loss/crossentropy": 2.833571434020996, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17675615847110748, "step": 3079 }, { "epoch": 0.04599108549414285, "grad_norm": 0.3671875, "grad_norm_var": 0.0008269627888997396, "learning_rate": 0.0001, "loss": 1.541, "loss/crossentropy": 2.409952163696289, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.20117250829935074, "step": 3080 }, { "epoch": 0.046006017664757835, "grad_norm": 0.40234375, "grad_norm_var": 0.0007674535115559896, "learning_rate": 0.0001, "loss": 1.4166, "loss/crossentropy": 2.874748706817627, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.16657818853855133, "step": 3081 }, { "epoch": 0.04602094983537282, "grad_norm": 0.486328125, "grad_norm_var": 0.0013396581013997396, "learning_rate": 0.0001, "loss": 1.6579, "loss/crossentropy": 2.571967601776123, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.22825753688812256, "step": 3082 }, { "epoch": 0.0460358820059878, "grad_norm": 0.36328125, "grad_norm_var": 0.0013425191243489583, "learning_rate": 0.0001, "loss": 1.5563, "loss/crossentropy": 2.691252827644348, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.2046958953142166, "step": 3083 }, { "epoch": 0.04605081417660278, "grad_norm": 0.37109375, "grad_norm_var": 0.0013530572255452474, "learning_rate": 0.0001, "loss": 1.5817, "loss/crossentropy": 2.7040563821792603, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.1988684982061386, "step": 3084 }, { "epoch": 0.046065746347217766, "grad_norm": 0.392578125, "grad_norm_var": 0.0012906392415364583, "learning_rate": 0.0001, "loss": 1.6434, "loss/crossentropy": 2.5715852975845337, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.2058606594800949, "step": 3085 }, { "epoch": 0.04608067851783274, "grad_norm": 0.376953125, "grad_norm_var": 0.0012438456217447917, "learning_rate": 0.0001, "loss": 1.5631, "loss/crossentropy": 2.52947735786438, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20368285477161407, "step": 3086 }, { "epoch": 0.046095610688447725, "grad_norm": 0.40625, "grad_norm_var": 0.001256418228149414, "learning_rate": 0.0001, "loss": 1.6061, "loss/crossentropy": 2.36742901802063, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.20372359454631805, "step": 3087 }, { "epoch": 0.04611054285906271, "grad_norm": 0.498046875, "grad_norm_var": 0.0019375960032145183, "learning_rate": 0.0001, "loss": 1.6998, "loss/crossentropy": 2.5441629886627197, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.2349698841571808, "step": 3088 }, { "epoch": 0.04612547502967769, "grad_norm": 0.43359375, "grad_norm_var": 0.0017560164133707682, "learning_rate": 0.0001, "loss": 1.7842, "loss/crossentropy": 2.3730982542037964, "loss/fcd": 1.54296875, "loss/idx": 9.0, "loss/logits": 0.24127189815044403, "step": 3089 }, { "epoch": 0.04614040720029267, "grad_norm": 0.365234375, "grad_norm_var": 0.0018253962198893229, "learning_rate": 0.0001, "loss": 1.5185, "loss/crossentropy": 2.7170838117599487, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.1865038499236107, "step": 3090 }, { "epoch": 0.046155339370907655, "grad_norm": 0.443359375, "grad_norm_var": 0.0019271691640218099, "learning_rate": 0.0001, "loss": 1.5604, "loss/crossentropy": 2.745132088661194, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.19714965671300888, "step": 3091 }, { "epoch": 0.04617027154152263, "grad_norm": 0.43359375, "grad_norm_var": 0.0016754150390625, "learning_rate": 0.0001, "loss": 1.7074, "loss/crossentropy": 2.6806834936141968, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.2542722672224045, "step": 3092 }, { "epoch": 0.046185203712137614, "grad_norm": 0.474609375, "grad_norm_var": 0.0019133885701497395, "learning_rate": 0.0001, "loss": 1.8014, "loss/crossentropy": 2.5400713682174683, "loss/fcd": 1.5625, "loss/idx": 9.0, "loss/logits": 0.23894117772579193, "step": 3093 }, { "epoch": 0.046200135882752597, "grad_norm": 0.419921875, "grad_norm_var": 0.0019077142079671225, "learning_rate": 0.0001, "loss": 1.6188, "loss/crossentropy": 2.817517876625061, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.22034704685211182, "step": 3094 }, { "epoch": 0.04621506805336758, "grad_norm": 0.423828125, "grad_norm_var": 0.001911147435506185, "learning_rate": 0.0001, "loss": 1.5748, "loss/crossentropy": 2.8088892698287964, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.21151988208293915, "step": 3095 }, { "epoch": 0.04623000022398256, "grad_norm": 0.376953125, "grad_norm_var": 0.0018533706665039063, "learning_rate": 0.0001, "loss": 1.4977, "loss/crossentropy": 2.5986393690109253, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.20085880160331726, "step": 3096 }, { "epoch": 0.04624493239459754, "grad_norm": 0.373046875, "grad_norm_var": 0.0019632816314697266, "learning_rate": 0.0001, "loss": 1.5221, "loss/crossentropy": 2.592566728591919, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19791093468666077, "step": 3097 }, { "epoch": 0.04625986456521252, "grad_norm": 0.376953125, "grad_norm_var": 0.0016695499420166016, "learning_rate": 0.0001, "loss": 1.55, "loss/crossentropy": 2.5289684534072876, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.1906643807888031, "step": 3098 }, { "epoch": 0.0462747967358275, "grad_norm": 0.396484375, "grad_norm_var": 0.0015401204427083334, "learning_rate": 0.0001, "loss": 1.6267, "loss/crossentropy": 2.473766565322876, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.20482122153043747, "step": 3099 }, { "epoch": 0.046289728906442486, "grad_norm": 0.392578125, "grad_norm_var": 0.001457071304321289, "learning_rate": 0.0001, "loss": 1.4355, "loss/crossentropy": 2.4301387071609497, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.16209951043128967, "step": 3100 }, { "epoch": 0.04630466107705747, "grad_norm": 0.546875, "grad_norm_var": 0.0025557835896809896, "learning_rate": 0.0001, "loss": 2.1823, "loss/crossentropy": 2.738608717918396, "loss/fcd": 1.79296875, "loss/idx": 9.0, "loss/logits": 0.3893527388572693, "step": 3101 }, { "epoch": 0.04631959324767245, "grad_norm": 0.6328125, "grad_norm_var": 0.005139780044555664, "learning_rate": 0.0001, "loss": 1.6568, "loss/crossentropy": 2.3933111429214478, "loss/fcd": 1.4765625, "loss/idx": 9.0, "loss/logits": 0.18028242886066437, "step": 3102 }, { "epoch": 0.04633452541828743, "grad_norm": 0.412109375, "grad_norm_var": 0.0051177978515625, "learning_rate": 0.0001, "loss": 1.5334, "loss/crossentropy": 2.4562193155288696, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.1974559873342514, "step": 3103 }, { "epoch": 0.04634945758890241, "grad_norm": 0.375, "grad_norm_var": 0.005070734024047852, "learning_rate": 0.0001, "loss": 1.4956, "loss/crossentropy": 2.457727313041687, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.19088391214609146, "step": 3104 }, { "epoch": 0.04636438975951739, "grad_norm": 0.4140625, "grad_norm_var": 0.005084721247355143, "learning_rate": 0.0001, "loss": 1.522, "loss/crossentropy": 2.5080443620681763, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19383231550455093, "step": 3105 }, { "epoch": 0.046379321930132375, "grad_norm": 0.51953125, "grad_norm_var": 0.005269304911295573, "learning_rate": 0.0001, "loss": 1.6475, "loss/crossentropy": 2.5375393629074097, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.2451617494225502, "step": 3106 }, { "epoch": 0.04639425410074736, "grad_norm": 0.37109375, "grad_norm_var": 0.005546299616495768, "learning_rate": 0.0001, "loss": 1.3919, "loss/crossentropy": 2.6089437007904053, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.1575084775686264, "step": 3107 }, { "epoch": 0.046409186271362333, "grad_norm": 0.41796875, "grad_norm_var": 0.00556181271870931, "learning_rate": 0.0001, "loss": 1.6273, "loss/crossentropy": 2.4994771480560303, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.2327336072921753, "step": 3108 }, { "epoch": 0.046424118441977316, "grad_norm": 0.38671875, "grad_norm_var": 0.0055539449055989586, "learning_rate": 0.0001, "loss": 1.5044, "loss/crossentropy": 2.617721438407898, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.1802249774336815, "step": 3109 }, { "epoch": 0.0464390506125923, "grad_norm": 0.384765625, "grad_norm_var": 0.00566552480061849, "learning_rate": 0.0001, "loss": 1.4795, "loss/crossentropy": 2.61292564868927, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18648821860551834, "step": 3110 }, { "epoch": 0.04645398278320728, "grad_norm": 0.53125, "grad_norm_var": 0.006369256973266601, "learning_rate": 0.0001, "loss": 1.9695, "loss/crossentropy": 2.6482081413269043, "loss/fcd": 1.65625, "loss/idx": 9.0, "loss/logits": 0.3132621943950653, "step": 3111 }, { "epoch": 0.046468914953822264, "grad_norm": 0.46875, "grad_norm_var": 0.006225077311197916, "learning_rate": 0.0001, "loss": 1.496, "loss/crossentropy": 2.5968077182769775, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.17954562604427338, "step": 3112 }, { "epoch": 0.04648384712443725, "grad_norm": 0.369140625, "grad_norm_var": 0.006259600321451823, "learning_rate": 0.0001, "loss": 1.6092, "loss/crossentropy": 2.429466962814331, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.20681451261043549, "step": 3113 }, { "epoch": 0.04649877929505222, "grad_norm": 0.365234375, "grad_norm_var": 0.006362406412760416, "learning_rate": 0.0001, "loss": 1.5524, "loss/crossentropy": 2.399208426475525, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.2008572816848755, "step": 3114 }, { "epoch": 0.046513711465667205, "grad_norm": 0.42578125, "grad_norm_var": 0.006259648005167643, "learning_rate": 0.0001, "loss": 1.5093, "loss/crossentropy": 2.682990550994873, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.18112893402576447, "step": 3115 }, { "epoch": 0.04652864363628219, "grad_norm": 0.3828125, "grad_norm_var": 0.0063252131144205725, "learning_rate": 0.0001, "loss": 1.4984, "loss/crossentropy": 2.471660852432251, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.17810578644275665, "step": 3116 }, { "epoch": 0.04654357580689717, "grad_norm": 0.404296875, "grad_norm_var": 0.005521122614542643, "learning_rate": 0.0001, "loss": 1.593, "loss/crossentropy": 2.9036530256271362, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.21408919245004654, "step": 3117 }, { "epoch": 0.046558507977512154, "grad_norm": 0.4296875, "grad_norm_var": 0.0025754133860270184, "learning_rate": 0.0001, "loss": 1.451, "loss/crossentropy": 2.564030885696411, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.1697898879647255, "step": 3118 }, { "epoch": 0.04657344014812713, "grad_norm": 0.474609375, "grad_norm_var": 0.002785984675089518, "learning_rate": 0.0001, "loss": 1.6343, "loss/crossentropy": 2.677256941795349, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.19676363468170166, "step": 3119 }, { "epoch": 0.04658837231874211, "grad_norm": 0.4609375, "grad_norm_var": 0.002731434504191081, "learning_rate": 0.0001, "loss": 1.8177, "loss/crossentropy": 2.602780818939209, "loss/fcd": 1.57421875, "loss/idx": 9.0, "loss/logits": 0.24349943548440933, "step": 3120 }, { "epoch": 0.046603304489357095, "grad_norm": 0.4453125, "grad_norm_var": 0.002745167414347331, "learning_rate": 0.0001, "loss": 1.3962, "loss/crossentropy": 2.8656243085861206, "loss/fcd": 1.23046875, "loss/idx": 9.0, "loss/logits": 0.16568725556135178, "step": 3121 }, { "epoch": 0.04661823665997208, "grad_norm": 0.376953125, "grad_norm_var": 0.002263641357421875, "learning_rate": 0.0001, "loss": 1.5283, "loss/crossentropy": 2.8737783432006836, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.21187427639961243, "step": 3122 }, { "epoch": 0.04663316883058706, "grad_norm": 0.40625, "grad_norm_var": 0.0021188735961914064, "learning_rate": 0.0001, "loss": 1.3002, "loss/crossentropy": 2.7621814012527466, "loss/fcd": 1.16796875, "loss/idx": 9.0, "loss/logits": 0.13220356404781342, "step": 3123 }, { "epoch": 0.04664810100120204, "grad_norm": 0.38671875, "grad_norm_var": 0.0021910985310872396, "learning_rate": 0.0001, "loss": 1.5883, "loss/crossentropy": 2.542693614959717, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.20545881986618042, "step": 3124 }, { "epoch": 0.04666303317181702, "grad_norm": 0.376953125, "grad_norm_var": 0.0022387027740478514, "learning_rate": 0.0001, "loss": 1.4951, "loss/crossentropy": 2.69387423992157, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.19044345617294312, "step": 3125 }, { "epoch": 0.046677965342432, "grad_norm": 0.498046875, "grad_norm_var": 0.002537393569946289, "learning_rate": 0.0001, "loss": 1.5194, "loss/crossentropy": 2.675103545188904, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.1756979152560234, "step": 3126 }, { "epoch": 0.046692897513046984, "grad_norm": 0.3984375, "grad_norm_var": 0.0017613569895426433, "learning_rate": 0.0001, "loss": 1.5629, "loss/crossentropy": 2.4969518184661865, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.19184650480747223, "step": 3127 }, { "epoch": 0.04670782968366197, "grad_norm": 0.703125, "grad_norm_var": 0.0068158308664957685, "learning_rate": 0.0001, "loss": 1.9358, "loss/crossentropy": 2.556679606437683, "loss/fcd": 1.625, "loss/idx": 9.0, "loss/logits": 0.3108226954936981, "step": 3128 }, { "epoch": 0.04672276185427695, "grad_norm": 0.67578125, "grad_norm_var": 0.010142262776692708, "learning_rate": 0.0001, "loss": 2.0322, "loss/crossentropy": 2.801246404647827, "loss/fcd": 1.7421875, "loss/idx": 9.0, "loss/logits": 0.2899772524833679, "step": 3129 }, { "epoch": 0.046737694024891925, "grad_norm": 0.435546875, "grad_norm_var": 0.009650166829427083, "learning_rate": 0.0001, "loss": 1.5426, "loss/crossentropy": 2.586599111557007, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.20273741334676743, "step": 3130 }, { "epoch": 0.04675262619550691, "grad_norm": 0.408203125, "grad_norm_var": 0.009738143285115559, "learning_rate": 0.0001, "loss": 1.4815, "loss/crossentropy": 2.911854863166809, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.18073569238185883, "step": 3131 }, { "epoch": 0.04676755836612189, "grad_norm": 0.5234375, "grad_norm_var": 0.009639724095662435, "learning_rate": 0.0001, "loss": 1.7183, "loss/crossentropy": 2.686639428138733, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.2573806792497635, "step": 3132 }, { "epoch": 0.04678249053673687, "grad_norm": 0.359375, "grad_norm_var": 0.010116068522135417, "learning_rate": 0.0001, "loss": 1.3939, "loss/crossentropy": 2.4805655479431152, "loss/fcd": 1.23046875, "loss/idx": 9.0, "loss/logits": 0.16343840211629868, "step": 3133 }, { "epoch": 0.046797422707351856, "grad_norm": 0.400390625, "grad_norm_var": 0.010287968317667644, "learning_rate": 0.0001, "loss": 1.5333, "loss/crossentropy": 2.5608690977096558, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.19736528396606445, "step": 3134 }, { "epoch": 0.04681235487796684, "grad_norm": 0.42578125, "grad_norm_var": 0.010329691569010417, "learning_rate": 0.0001, "loss": 1.5753, "loss/crossentropy": 2.400082588195801, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.20025186240673065, "step": 3135 }, { "epoch": 0.046827287048581814, "grad_norm": 0.43359375, "grad_norm_var": 0.010355059305826824, "learning_rate": 0.0001, "loss": 1.5522, "loss/crossentropy": 2.4945040941238403, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.20067646354436874, "step": 3136 }, { "epoch": 0.0468422192191968, "grad_norm": 0.375, "grad_norm_var": 0.010739580790201823, "learning_rate": 0.0001, "loss": 1.5307, "loss/crossentropy": 2.629315972328186, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.20253190398216248, "step": 3137 }, { "epoch": 0.04685715138981178, "grad_norm": 1.0, "grad_norm_var": 0.029018259048461913, "learning_rate": 0.0001, "loss": 1.7498, "loss/crossentropy": 2.5127376317977905, "loss/fcd": 1.54296875, "loss/idx": 9.0, "loss/logits": 0.2067951038479805, "step": 3138 }, { "epoch": 0.04687208356042676, "grad_norm": 0.416015625, "grad_norm_var": 0.028917884826660155, "learning_rate": 0.0001, "loss": 1.6161, "loss/crossentropy": 2.463278651237488, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.1980830729007721, "step": 3139 }, { "epoch": 0.046887015731041745, "grad_norm": 0.3828125, "grad_norm_var": 0.02897186279296875, "learning_rate": 0.0001, "loss": 1.5637, "loss/crossentropy": 2.5163652896881104, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.19646252691745758, "step": 3140 }, { "epoch": 0.04690194790165672, "grad_norm": 0.431640625, "grad_norm_var": 0.028347015380859375, "learning_rate": 0.0001, "loss": 1.7262, "loss/crossentropy": 2.361856698989868, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.2378748208284378, "step": 3141 }, { "epoch": 0.046916880072271704, "grad_norm": 0.453125, "grad_norm_var": 0.028435118993123374, "learning_rate": 0.0001, "loss": 1.6912, "loss/crossentropy": 2.8334085941314697, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.24197349697351456, "step": 3142 }, { "epoch": 0.046931812242886686, "grad_norm": 0.349609375, "grad_norm_var": 0.02917302449544271, "learning_rate": 0.0001, "loss": 1.6316, "loss/crossentropy": 2.341671109199524, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.21755322813987732, "step": 3143 }, { "epoch": 0.04694674441350167, "grad_norm": 0.46484375, "grad_norm_var": 0.025818316141764323, "learning_rate": 0.0001, "loss": 1.4974, "loss/crossentropy": 2.649364709854126, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.18493737280368805, "step": 3144 }, { "epoch": 0.04696167658411665, "grad_norm": 0.392578125, "grad_norm_var": 0.023096450169881187, "learning_rate": 0.0001, "loss": 1.4507, "loss/crossentropy": 2.745826005935669, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17724765092134476, "step": 3145 }, { "epoch": 0.046976608754731634, "grad_norm": 0.4140625, "grad_norm_var": 0.023176002502441406, "learning_rate": 0.0001, "loss": 1.6022, "loss/crossentropy": 2.4293630123138428, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.1959889903664589, "step": 3146 }, { "epoch": 0.04699154092534661, "grad_norm": 0.341796875, "grad_norm_var": 0.023838551839192708, "learning_rate": 0.0001, "loss": 1.4487, "loss/crossentropy": 2.359114408493042, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.17132177203893661, "step": 3147 }, { "epoch": 0.04700647309596159, "grad_norm": 0.36328125, "grad_norm_var": 0.02382551829020182, "learning_rate": 0.0001, "loss": 1.4869, "loss/crossentropy": 2.585848569869995, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18998490273952484, "step": 3148 }, { "epoch": 0.047021405266576576, "grad_norm": 0.35546875, "grad_norm_var": 0.023867289225260418, "learning_rate": 0.0001, "loss": 1.5218, "loss/crossentropy": 2.546360492706299, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.1898171752691269, "step": 3149 }, { "epoch": 0.04703633743719156, "grad_norm": 0.419921875, "grad_norm_var": 0.02379449208577474, "learning_rate": 0.0001, "loss": 1.55, "loss/crossentropy": 2.5424532890319824, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19061622768640518, "step": 3150 }, { "epoch": 0.04705126960780654, "grad_norm": 0.375, "grad_norm_var": 0.02404327392578125, "learning_rate": 0.0001, "loss": 1.4695, "loss/crossentropy": 2.554916024208069, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.17258740961551666, "step": 3151 }, { "epoch": 0.047066201778421524, "grad_norm": 0.3984375, "grad_norm_var": 0.024129676818847656, "learning_rate": 0.0001, "loss": 1.5254, "loss/crossentropy": 2.6738905906677246, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.2050549015402794, "step": 3152 }, { "epoch": 0.0470811339490365, "grad_norm": 0.46875, "grad_norm_var": 0.023949623107910156, "learning_rate": 0.0001, "loss": 1.5947, "loss/crossentropy": 2.5284717082977295, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.20794843137264252, "step": 3153 }, { "epoch": 0.04709606611965148, "grad_norm": 0.43359375, "grad_norm_var": 0.0016492207845052083, "learning_rate": 0.0001, "loss": 1.5699, "loss/crossentropy": 2.6028409004211426, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20274603366851807, "step": 3154 }, { "epoch": 0.047110998290266465, "grad_norm": 0.376953125, "grad_norm_var": 0.0016810099283854167, "learning_rate": 0.0001, "loss": 1.622, "loss/crossentropy": 2.5545225143432617, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.2157014161348343, "step": 3155 }, { "epoch": 0.04712593046088145, "grad_norm": 0.5390625, "grad_norm_var": 0.0028203328450520835, "learning_rate": 0.0001, "loss": 1.9583, "loss/crossentropy": 2.3186652660369873, "loss/fcd": 1.6640625, "loss/idx": 9.0, "loss/logits": 0.29425202310085297, "step": 3156 }, { "epoch": 0.04714086263149643, "grad_norm": 0.40234375, "grad_norm_var": 0.00279386838277181, "learning_rate": 0.0001, "loss": 1.4338, "loss/crossentropy": 2.6120704412460327, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.16814450919628143, "step": 3157 }, { "epoch": 0.047155794802111406, "grad_norm": 0.43359375, "grad_norm_var": 0.002703587214152018, "learning_rate": 0.0001, "loss": 1.5183, "loss/crossentropy": 2.668149948120117, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.1940547153353691, "step": 3158 }, { "epoch": 0.04717072697272639, "grad_norm": 0.408203125, "grad_norm_var": 0.0024613539377848306, "learning_rate": 0.0001, "loss": 1.5472, "loss/crossentropy": 2.4839673042297363, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19559723138809204, "step": 3159 }, { "epoch": 0.04718565914334137, "grad_norm": 0.58203125, "grad_norm_var": 0.0041493574778238935, "learning_rate": 0.0001, "loss": 1.7984, "loss/crossentropy": 2.434971332550049, "loss/fcd": 1.56640625, "loss/idx": 9.0, "loss/logits": 0.23203948140144348, "step": 3160 }, { "epoch": 0.047200591313956354, "grad_norm": 0.359375, "grad_norm_var": 0.004335530598958333, "learning_rate": 0.0001, "loss": 1.5239, "loss/crossentropy": 2.5454121828079224, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.1996612623333931, "step": 3161 }, { "epoch": 0.04721552348457134, "grad_norm": 0.5703125, "grad_norm_var": 0.005800374348958333, "learning_rate": 0.0001, "loss": 1.6926, "loss/crossentropy": 2.6913294792175293, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.23171231150627136, "step": 3162 }, { "epoch": 0.04723045565518632, "grad_norm": 0.373046875, "grad_norm_var": 0.005507405598958333, "learning_rate": 0.0001, "loss": 1.4986, "loss/crossentropy": 2.5074089765548706, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.18215472251176834, "step": 3163 }, { "epoch": 0.047245387825801295, "grad_norm": 0.453125, "grad_norm_var": 0.005228106180826823, "learning_rate": 0.0001, "loss": 1.593, "loss/crossentropy": 2.624423861503601, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.20241886377334595, "step": 3164 }, { "epoch": 0.04726031999641628, "grad_norm": 0.400390625, "grad_norm_var": 0.004881906509399414, "learning_rate": 0.0001, "loss": 1.4757, "loss/crossentropy": 2.435251832008362, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18268324434757233, "step": 3165 }, { "epoch": 0.04727525216703126, "grad_norm": 0.5859375, "grad_norm_var": 0.006223487854003906, "learning_rate": 0.0001, "loss": 1.6107, "loss/crossentropy": 2.6356441974639893, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.1927664652466774, "step": 3166 }, { "epoch": 0.04729018433764624, "grad_norm": 0.3984375, "grad_norm_var": 0.006031227111816406, "learning_rate": 0.0001, "loss": 1.6272, "loss/crossentropy": 2.594663619995117, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.2092401310801506, "step": 3167 }, { "epoch": 0.047305116508261226, "grad_norm": 0.365234375, "grad_norm_var": 0.006323862075805664, "learning_rate": 0.0001, "loss": 1.4895, "loss/crossentropy": 2.4922765493392944, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.17698125541210175, "step": 3168 }, { "epoch": 0.0473200486788762, "grad_norm": 0.37890625, "grad_norm_var": 0.006566603978474935, "learning_rate": 0.0001, "loss": 1.4438, "loss/crossentropy": 2.7809557914733887, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.1781632900238037, "step": 3169 }, { "epoch": 0.047334980849491184, "grad_norm": 0.451171875, "grad_norm_var": 0.006567891438802083, "learning_rate": 0.0001, "loss": 1.6867, "loss/crossentropy": 2.3744956254959106, "loss/fcd": 1.47265625, "loss/idx": 9.0, "loss/logits": 0.21402651816606522, "step": 3170 }, { "epoch": 0.04734991302010617, "grad_norm": 0.376953125, "grad_norm_var": 0.006567891438802083, "learning_rate": 0.0001, "loss": 1.5281, "loss/crossentropy": 2.8049256801605225, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19608278572559357, "step": 3171 }, { "epoch": 0.04736484519072115, "grad_norm": 0.70703125, "grad_norm_var": 0.01049645741780599, "learning_rate": 0.0001, "loss": 1.6561, "loss/crossentropy": 3.0284377336502075, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.2147204950451851, "step": 3172 }, { "epoch": 0.04737977736133613, "grad_norm": 0.443359375, "grad_norm_var": 0.010325225194295247, "learning_rate": 0.0001, "loss": 1.5747, "loss/crossentropy": 2.440250515937805, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.20364370197057724, "step": 3173 }, { "epoch": 0.047394709531951115, "grad_norm": 0.392578125, "grad_norm_var": 0.01054986317952474, "learning_rate": 0.0001, "loss": 1.4245, "loss/crossentropy": 2.5995298624038696, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.16273919492959976, "step": 3174 }, { "epoch": 0.04740964170256609, "grad_norm": 0.375, "grad_norm_var": 0.010816558202107748, "learning_rate": 0.0001, "loss": 1.6799, "loss/crossentropy": 2.696618437767029, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.24626919627189636, "step": 3175 }, { "epoch": 0.047424573873181074, "grad_norm": 0.341796875, "grad_norm_var": 0.01022027333577474, "learning_rate": 0.0001, "loss": 1.4064, "loss/crossentropy": 2.570863366127014, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.17205122113227844, "step": 3176 }, { "epoch": 0.047439506043796056, "grad_norm": 0.408203125, "grad_norm_var": 0.009871784845987957, "learning_rate": 0.0001, "loss": 1.6431, "loss/crossentropy": 2.6167795658111572, "loss/fcd": 1.4453125, "loss/idx": 9.0, "loss/logits": 0.19778906553983688, "step": 3177 }, { "epoch": 0.04745443821441104, "grad_norm": 0.48046875, "grad_norm_var": 0.00880138079325358, "learning_rate": 0.0001, "loss": 1.8536, "loss/crossentropy": 2.714370608329773, "loss/fcd": 1.62109375, "loss/idx": 9.0, "loss/logits": 0.23248600959777832, "step": 3178 }, { "epoch": 0.04746937038502602, "grad_norm": 0.3828125, "grad_norm_var": 0.008728981018066406, "learning_rate": 0.0001, "loss": 1.6263, "loss/crossentropy": 2.6571682691574097, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.22390927374362946, "step": 3179 }, { "epoch": 0.047484302555641, "grad_norm": 0.390625, "grad_norm_var": 0.008812395731608073, "learning_rate": 0.0001, "loss": 1.4362, "loss/crossentropy": 2.784103274345398, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17055723816156387, "step": 3180 }, { "epoch": 0.04749923472625598, "grad_norm": 0.359375, "grad_norm_var": 0.00907909075419108, "learning_rate": 0.0001, "loss": 1.4331, "loss/crossentropy": 2.5748485326766968, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.15966461598873138, "step": 3181 }, { "epoch": 0.04751416689687096, "grad_norm": 0.40625, "grad_norm_var": 0.007298008600870768, "learning_rate": 0.0001, "loss": 1.5107, "loss/crossentropy": 2.8203163146972656, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.19035401940345764, "step": 3182 }, { "epoch": 0.047529099067485946, "grad_norm": 0.490234375, "grad_norm_var": 0.0076080322265625, "learning_rate": 0.0001, "loss": 1.8042, "loss/crossentropy": 2.927725315093994, "loss/fcd": 1.5390625, "loss/idx": 9.0, "loss/logits": 0.26510997861623764, "step": 3183 }, { "epoch": 0.04754403123810093, "grad_norm": 0.41015625, "grad_norm_var": 0.007394901911417643, "learning_rate": 0.0001, "loss": 1.5423, "loss/crossentropy": 2.527675151824951, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19077805429697037, "step": 3184 }, { "epoch": 0.04755896340871591, "grad_norm": 0.4140625, "grad_norm_var": 0.007257572809855143, "learning_rate": 0.0001, "loss": 1.6023, "loss/crossentropy": 2.711413860321045, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.21556946635246277, "step": 3185 }, { "epoch": 0.04757389557933089, "grad_norm": 0.431640625, "grad_norm_var": 0.007218154271443685, "learning_rate": 0.0001, "loss": 1.4426, "loss/crossentropy": 2.87962543964386, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17693737149238586, "step": 3186 }, { "epoch": 0.04758882774994587, "grad_norm": 0.388671875, "grad_norm_var": 0.007150634129842123, "learning_rate": 0.0001, "loss": 1.6427, "loss/crossentropy": 2.4622280597686768, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.2129702940583229, "step": 3187 }, { "epoch": 0.04760375992056085, "grad_norm": 0.447265625, "grad_norm_var": 0.00164794921875, "learning_rate": 0.0001, "loss": 1.6278, "loss/crossentropy": 2.657227873802185, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.22934016585350037, "step": 3188 }, { "epoch": 0.047618692091175835, "grad_norm": 0.404296875, "grad_norm_var": 0.0015703837076822917, "learning_rate": 0.0001, "loss": 1.5152, "loss/crossentropy": 2.5945013761520386, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.19489052146673203, "step": 3189 }, { "epoch": 0.04763362426179082, "grad_norm": 0.53515625, "grad_norm_var": 0.0025531609853108725, "learning_rate": 0.0001, "loss": 1.7623, "loss/crossentropy": 2.8040578365325928, "loss/fcd": 1.53125, "loss/idx": 9.0, "loss/logits": 0.2310032993555069, "step": 3190 }, { "epoch": 0.04764855643240579, "grad_norm": 0.384765625, "grad_norm_var": 0.002504920959472656, "learning_rate": 0.0001, "loss": 1.5788, "loss/crossentropy": 2.657177686691284, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.21160347014665604, "step": 3191 }, { "epoch": 0.047663488603020776, "grad_norm": 0.392578125, "grad_norm_var": 0.002155303955078125, "learning_rate": 0.0001, "loss": 1.5788, "loss/crossentropy": 2.5724059343338013, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.196037195622921, "step": 3192 }, { "epoch": 0.04767842077363576, "grad_norm": 0.443359375, "grad_norm_var": 0.0021753311157226562, "learning_rate": 0.0001, "loss": 1.6265, "loss/crossentropy": 2.5623844861984253, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.2045988067984581, "step": 3193 }, { "epoch": 0.04769335294425074, "grad_norm": 0.396484375, "grad_norm_var": 0.001968240737915039, "learning_rate": 0.0001, "loss": 1.4652, "loss/crossentropy": 2.8494330644607544, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18005630373954773, "step": 3194 }, { "epoch": 0.047708285114865724, "grad_norm": 1.1015625, "grad_norm_var": 0.030945189793904624, "learning_rate": 0.0001, "loss": 1.8426, "loss/crossentropy": 2.7874138355255127, "loss/fcd": 1.5703125, "loss/idx": 9.0, "loss/logits": 0.27228844165802, "step": 3195 }, { "epoch": 0.04772321728548071, "grad_norm": 0.416015625, "grad_norm_var": 0.030742899576822916, "learning_rate": 0.0001, "loss": 1.5674, "loss/crossentropy": 2.4446972608566284, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.1923702135682106, "step": 3196 }, { "epoch": 0.04773814945609568, "grad_norm": 0.421875, "grad_norm_var": 0.03011627197265625, "learning_rate": 0.0001, "loss": 1.761, "loss/crossentropy": 2.452430486679077, "loss/fcd": 1.51953125, "loss/idx": 9.0, "loss/logits": 0.2414516806602478, "step": 3197 }, { "epoch": 0.047753081626710665, "grad_norm": 0.40625, "grad_norm_var": 0.03011627197265625, "learning_rate": 0.0001, "loss": 1.4917, "loss/crossentropy": 2.4592024087905884, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1752459481358528, "step": 3198 }, { "epoch": 0.04776801379732565, "grad_norm": 0.41015625, "grad_norm_var": 0.030277236302693685, "learning_rate": 0.0001, "loss": 1.5725, "loss/crossentropy": 2.6462767124176025, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20526786148548126, "step": 3199 }, { "epoch": 0.04778294596794063, "grad_norm": 0.353515625, "grad_norm_var": 0.03087507883707682, "learning_rate": 0.0001, "loss": 1.4546, "loss/crossentropy": 2.7943620681762695, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.1928454488515854, "step": 3200 }, { "epoch": 0.04779787813855561, "grad_norm": 0.38671875, "grad_norm_var": 0.031086476643880208, "learning_rate": 0.0001, "loss": 1.4427, "loss/crossentropy": 2.7485281229019165, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.18879586458206177, "step": 3201 }, { "epoch": 0.04781281030917059, "grad_norm": 0.416015625, "grad_norm_var": 0.03115564982096354, "learning_rate": 0.0001, "loss": 1.5901, "loss/crossentropy": 2.496340036392212, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.21898765116930008, "step": 3202 }, { "epoch": 0.04782774247978557, "grad_norm": 0.37890625, "grad_norm_var": 0.03124998410542806, "learning_rate": 0.0001, "loss": 1.702, "loss/crossentropy": 2.3052037358283997, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.24105000495910645, "step": 3203 }, { "epoch": 0.047842674650400555, "grad_norm": 0.69140625, "grad_norm_var": 0.034693145751953126, "learning_rate": 0.0001, "loss": 1.9483, "loss/crossentropy": 2.244235873222351, "loss/fcd": 1.71875, "loss/idx": 9.0, "loss/logits": 0.22959135472774506, "step": 3204 }, { "epoch": 0.04785760682101554, "grad_norm": 0.546875, "grad_norm_var": 0.0346919854482015, "learning_rate": 0.0001, "loss": 1.7851, "loss/crossentropy": 2.290521800518036, "loss/fcd": 1.53515625, "loss/idx": 9.0, "loss/logits": 0.24998635798692703, "step": 3205 }, { "epoch": 0.04787253899163052, "grad_norm": 0.447265625, "grad_norm_var": 0.034529622395833334, "learning_rate": 0.0001, "loss": 1.671, "loss/crossentropy": 2.4127440452575684, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.21001601219177246, "step": 3206 }, { "epoch": 0.0478874711622455, "grad_norm": 0.47265625, "grad_norm_var": 0.0339595635732015, "learning_rate": 0.0001, "loss": 1.701, "loss/crossentropy": 2.5913838148117065, "loss/fcd": 1.484375, "loss/idx": 9.0, "loss/logits": 0.21665877848863602, "step": 3207 }, { "epoch": 0.04790240333286048, "grad_norm": 0.43359375, "grad_norm_var": 0.03358605702718099, "learning_rate": 0.0001, "loss": 1.5268, "loss/crossentropy": 2.7952643632888794, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19472820311784744, "step": 3208 }, { "epoch": 0.04791733550347546, "grad_norm": 0.39453125, "grad_norm_var": 0.033990971247355145, "learning_rate": 0.0001, "loss": 1.4588, "loss/crossentropy": 2.4923447370529175, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.17365411669015884, "step": 3209 }, { "epoch": 0.047932267674090444, "grad_norm": 0.3515625, "grad_norm_var": 0.03461500803629557, "learning_rate": 0.0001, "loss": 1.5595, "loss/crossentropy": 2.323580265045166, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.20793946087360382, "step": 3210 }, { "epoch": 0.047947199844705427, "grad_norm": 0.349609375, "grad_norm_var": 0.007316319147745768, "learning_rate": 0.0001, "loss": 1.5749, "loss/crossentropy": 2.7429014444351196, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.2155703753232956, "step": 3211 }, { "epoch": 0.04796213201532041, "grad_norm": 0.349609375, "grad_norm_var": 0.007714064915974935, "learning_rate": 0.0001, "loss": 1.4447, "loss/crossentropy": 2.413248062133789, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.17514200508594513, "step": 3212 }, { "epoch": 0.04797706418593539, "grad_norm": 0.3984375, "grad_norm_var": 0.0077602227528889975, "learning_rate": 0.0001, "loss": 1.5723, "loss/crossentropy": 2.6674329042434692, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.2012316882610321, "step": 3213 }, { "epoch": 0.04799199635655037, "grad_norm": 0.4375, "grad_norm_var": 0.007746489842732748, "learning_rate": 0.0001, "loss": 1.8468, "loss/crossentropy": 2.3436938524246216, "loss/fcd": 1.58984375, "loss/idx": 9.0, "loss/logits": 0.25691723078489304, "step": 3214 }, { "epoch": 0.04800692852716535, "grad_norm": 0.38671875, "grad_norm_var": 0.007830794652303059, "learning_rate": 0.0001, "loss": 1.5511, "loss/crossentropy": 2.6097155809402466, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.18386971950531006, "step": 3215 }, { "epoch": 0.04802186069778033, "grad_norm": 0.373046875, "grad_norm_var": 0.007669305801391602, "learning_rate": 0.0001, "loss": 1.4272, "loss/crossentropy": 2.838638424873352, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.17716633528470993, "step": 3216 }, { "epoch": 0.048036792868395316, "grad_norm": 0.373046875, "grad_norm_var": 0.007752418518066406, "learning_rate": 0.0001, "loss": 1.5657, "loss/crossentropy": 2.538647770881653, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.21415185928344727, "step": 3217 }, { "epoch": 0.0480517250390103, "grad_norm": 0.380859375, "grad_norm_var": 0.00787200927734375, "learning_rate": 0.0001, "loss": 1.3972, "loss/crossentropy": 2.714820981025696, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.1628197431564331, "step": 3218 }, { "epoch": 0.048066657209625274, "grad_norm": 0.3984375, "grad_norm_var": 0.007781410217285156, "learning_rate": 0.0001, "loss": 1.5246, "loss/crossentropy": 2.891999840736389, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.18870051950216293, "step": 3219 }, { "epoch": 0.04808158938024026, "grad_norm": 0.3828125, "grad_norm_var": 0.002733612060546875, "learning_rate": 0.0001, "loss": 1.4146, "loss/crossentropy": 2.6945918798446655, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.16852380335330963, "step": 3220 }, { "epoch": 0.04809652155085524, "grad_norm": 0.423828125, "grad_norm_var": 0.001348733901977539, "learning_rate": 0.0001, "loss": 1.658, "loss/crossentropy": 2.518216133117676, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.21655705571174622, "step": 3221 }, { "epoch": 0.04811145372147022, "grad_norm": 0.455078125, "grad_norm_var": 0.0014048099517822265, "learning_rate": 0.0001, "loss": 1.7431, "loss/crossentropy": 2.5469448566436768, "loss/fcd": 1.53125, "loss/idx": 9.0, "loss/logits": 0.21188458055257797, "step": 3222 }, { "epoch": 0.048126385892085205, "grad_norm": 0.373046875, "grad_norm_var": 0.0010278701782226562, "learning_rate": 0.0001, "loss": 1.4936, "loss/crossentropy": 2.5482213497161865, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.19283410161733627, "step": 3223 }, { "epoch": 0.04814131806270019, "grad_norm": 0.369140625, "grad_norm_var": 0.0009245395660400391, "learning_rate": 0.0001, "loss": 1.5098, "loss/crossentropy": 2.547440767288208, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.18166812509298325, "step": 3224 }, { "epoch": 0.04815625023331516, "grad_norm": 0.388671875, "grad_norm_var": 0.0009210586547851562, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.733353018760681, "loss/fcd": 1.19921875, "loss/idx": 9.0, "loss/logits": 0.15746179223060608, "step": 3225 }, { "epoch": 0.048171182403930146, "grad_norm": 0.384765625, "grad_norm_var": 0.0008332411448160808, "learning_rate": 0.0001, "loss": 1.4897, "loss/crossentropy": 2.4307159185409546, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18506229668855667, "step": 3226 }, { "epoch": 0.04818611457454513, "grad_norm": 0.38671875, "grad_norm_var": 0.0007242202758789062, "learning_rate": 0.0001, "loss": 1.5622, "loss/crossentropy": 2.5354355573654175, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.2028021663427353, "step": 3227 }, { "epoch": 0.04820104674516011, "grad_norm": 0.431640625, "grad_norm_var": 0.00068817138671875, "learning_rate": 0.0001, "loss": 1.4969, "loss/crossentropy": 2.5665853023529053, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.164893239736557, "step": 3228 }, { "epoch": 0.048215978915775094, "grad_norm": 0.3984375, "grad_norm_var": 0.00068817138671875, "learning_rate": 0.0001, "loss": 1.522, "loss/crossentropy": 2.582373023033142, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.1899520754814148, "step": 3229 }, { "epoch": 0.04823091108639007, "grad_norm": 0.421875, "grad_norm_var": 0.00061798095703125, "learning_rate": 0.0001, "loss": 1.5314, "loss/crossentropy": 2.479655146598816, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.18764080107212067, "step": 3230 }, { "epoch": 0.04824584325700505, "grad_norm": 0.37890625, "grad_norm_var": 0.000630950927734375, "learning_rate": 0.0001, "loss": 1.6402, "loss/crossentropy": 2.584221839904785, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.23002317547798157, "step": 3231 }, { "epoch": 0.048260775427620035, "grad_norm": 0.4140625, "grad_norm_var": 0.0006159305572509766, "learning_rate": 0.0001, "loss": 1.5652, "loss/crossentropy": 2.521495819091797, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.19805461913347244, "step": 3232 }, { "epoch": 0.04827570759823502, "grad_norm": 0.416015625, "grad_norm_var": 0.0005907535552978516, "learning_rate": 0.0001, "loss": 1.4862, "loss/crossentropy": 2.4999505281448364, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18154353648424149, "step": 3233 }, { "epoch": 0.04829063976885, "grad_norm": 0.3828125, "grad_norm_var": 0.0005859375, "learning_rate": 0.0001, "loss": 1.4753, "loss/crossentropy": 2.6002577543258667, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.1862766146659851, "step": 3234 }, { "epoch": 0.048305571939464984, "grad_norm": 0.357421875, "grad_norm_var": 0.0007017612457275391, "learning_rate": 0.0001, "loss": 1.3928, "loss/crossentropy": 2.6865028142929077, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.1583971232175827, "step": 3235 }, { "epoch": 0.04832050411007996, "grad_norm": 0.47265625, "grad_norm_var": 0.0010263919830322266, "learning_rate": 0.0001, "loss": 1.6535, "loss/crossentropy": 2.6959011554718018, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.23162022978067398, "step": 3236 }, { "epoch": 0.04833543628069494, "grad_norm": 0.46484375, "grad_norm_var": 0.0012430191040039063, "learning_rate": 0.0001, "loss": 1.7785, "loss/crossentropy": 2.8164443969726562, "loss/fcd": 1.52734375, "loss/idx": 9.0, "loss/logits": 0.25115057826042175, "step": 3237 }, { "epoch": 0.048350368451309925, "grad_norm": 0.4140625, "grad_norm_var": 0.0010797977447509766, "learning_rate": 0.0001, "loss": 1.512, "loss/crossentropy": 2.718519449234009, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.18386279046535492, "step": 3238 }, { "epoch": 0.04836530062192491, "grad_norm": 0.37890625, "grad_norm_var": 0.001058197021484375, "learning_rate": 0.0001, "loss": 1.4087, "loss/crossentropy": 2.6961898803710938, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.16649317741394043, "step": 3239 }, { "epoch": 0.04838023279253989, "grad_norm": 0.41015625, "grad_norm_var": 0.0009737491607666015, "learning_rate": 0.0001, "loss": 1.5409, "loss/crossentropy": 2.6049740314483643, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.17375807464122772, "step": 3240 }, { "epoch": 0.048395164963154866, "grad_norm": 0.392578125, "grad_norm_var": 0.0009654839833577474, "learning_rate": 0.0001, "loss": 1.3998, "loss/crossentropy": 2.707767605781555, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.16546224802732468, "step": 3241 }, { "epoch": 0.04841009713376985, "grad_norm": 0.361328125, "grad_norm_var": 0.00106809933980306, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.561444044113159, "loss/fcd": 1.21484375, "loss/idx": 9.0, "loss/logits": 0.14960087835788727, "step": 3242 }, { "epoch": 0.04842502930438483, "grad_norm": 0.447265625, "grad_norm_var": 0.0011484146118164063, "learning_rate": 0.0001, "loss": 1.8566, "loss/crossentropy": 2.267409086227417, "loss/fcd": 1.62109375, "loss/idx": 9.0, "loss/logits": 0.23554657399654388, "step": 3243 }, { "epoch": 0.048439961474999814, "grad_norm": 0.3984375, "grad_norm_var": 0.001116800308227539, "learning_rate": 0.0001, "loss": 1.6027, "loss/crossentropy": 2.5038150548934937, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.19644096493721008, "step": 3244 }, { "epoch": 0.0484548936456148, "grad_norm": 0.353515625, "grad_norm_var": 0.0012933731079101563, "learning_rate": 0.0001, "loss": 1.6395, "loss/crossentropy": 2.448174238204956, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.22543590515851974, "step": 3245 }, { "epoch": 0.04846982581622978, "grad_norm": 0.41015625, "grad_norm_var": 0.00127410888671875, "learning_rate": 0.0001, "loss": 1.5453, "loss/crossentropy": 2.4717395305633545, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.18203365802764893, "step": 3246 }, { "epoch": 0.048484757986844755, "grad_norm": 0.42578125, "grad_norm_var": 0.00125885009765625, "learning_rate": 0.0001, "loss": 1.6437, "loss/crossentropy": 2.3568612337112427, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.19450107216835022, "step": 3247 }, { "epoch": 0.04849969015745974, "grad_norm": 0.35546875, "grad_norm_var": 0.0014123916625976562, "learning_rate": 0.0001, "loss": 1.3914, "loss/crossentropy": 2.63204026222229, "loss/fcd": 1.2265625, "loss/idx": 9.0, "loss/logits": 0.16479476541280746, "step": 3248 }, { "epoch": 0.04851462232807472, "grad_norm": 0.369140625, "grad_norm_var": 0.0014657974243164062, "learning_rate": 0.0001, "loss": 1.5268, "loss/crossentropy": 2.86312735080719, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.1947481334209442, "step": 3249 }, { "epoch": 0.0485295544986897, "grad_norm": 0.4453125, "grad_norm_var": 0.0015695571899414062, "learning_rate": 0.0001, "loss": 1.6465, "loss/crossentropy": 2.634191632270813, "loss/fcd": 1.4453125, "loss/idx": 9.0, "loss/logits": 0.20118340849876404, "step": 3250 }, { "epoch": 0.048544486669304686, "grad_norm": 0.333984375, "grad_norm_var": 0.0017480850219726562, "learning_rate": 0.0001, "loss": 1.3724, "loss/crossentropy": 2.4930338859558105, "loss/fcd": 1.2109375, "loss/idx": 9.0, "loss/logits": 0.16148003935813904, "step": 3251 }, { "epoch": 0.04855941883991966, "grad_norm": 0.423828125, "grad_norm_var": 0.0014377435048421223, "learning_rate": 0.0001, "loss": 1.6463, "loss/crossentropy": 2.8786908388137817, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.23612572252750397, "step": 3252 }, { "epoch": 0.048574351010534644, "grad_norm": 0.380859375, "grad_norm_var": 0.001141802469889323, "learning_rate": 0.0001, "loss": 1.5233, "loss/crossentropy": 2.398237109184265, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19518128782510757, "step": 3253 }, { "epoch": 0.04858928318114963, "grad_norm": 0.43359375, "grad_norm_var": 0.001218414306640625, "learning_rate": 0.0001, "loss": 1.4419, "loss/crossentropy": 2.5248372554779053, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.17237873375415802, "step": 3254 }, { "epoch": 0.04860421535176461, "grad_norm": 0.384765625, "grad_norm_var": 0.0012079715728759766, "learning_rate": 0.0001, "loss": 1.4321, "loss/crossentropy": 2.599307894706726, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.18205182254314423, "step": 3255 }, { "epoch": 0.04861914752237959, "grad_norm": 0.384765625, "grad_norm_var": 0.001198259989420573, "learning_rate": 0.0001, "loss": 1.574, "loss/crossentropy": 2.6433037519454956, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.19896552711725235, "step": 3256 }, { "epoch": 0.048634079692994575, "grad_norm": 0.66015625, "grad_norm_var": 0.005629587173461914, "learning_rate": 0.0001, "loss": 2.4424, "loss/crossentropy": 2.664064884185791, "loss/fcd": 2.0234375, "loss/idx": 9.0, "loss/logits": 0.4189932197332382, "step": 3257 }, { "epoch": 0.04864901186360955, "grad_norm": 0.375, "grad_norm_var": 0.005551592508951823, "learning_rate": 0.0001, "loss": 1.5707, "loss/crossentropy": 2.588623285293579, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20349613577127457, "step": 3258 }, { "epoch": 0.048663944034224534, "grad_norm": 0.4453125, "grad_norm_var": 0.005542484919230143, "learning_rate": 0.0001, "loss": 1.605, "loss/crossentropy": 2.6482077836990356, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.1987440213561058, "step": 3259 }, { "epoch": 0.048678876204839516, "grad_norm": 0.4765625, "grad_norm_var": 0.005790440241495768, "learning_rate": 0.0001, "loss": 1.5084, "loss/crossentropy": 2.68269681930542, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.17640207707881927, "step": 3260 }, { "epoch": 0.0486938083754545, "grad_norm": 0.45703125, "grad_norm_var": 0.005595842997233073, "learning_rate": 0.0001, "loss": 1.5994, "loss/crossentropy": 2.6126770973205566, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.19310010969638824, "step": 3261 }, { "epoch": 0.04870874054606948, "grad_norm": 0.330078125, "grad_norm_var": 0.006129566828409831, "learning_rate": 0.0001, "loss": 1.4683, "loss/crossentropy": 2.49465274810791, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.19489428400993347, "step": 3262 }, { "epoch": 0.04872367271668446, "grad_norm": 0.40234375, "grad_norm_var": 0.006138340632120768, "learning_rate": 0.0001, "loss": 1.4517, "loss/crossentropy": 2.6894044876098633, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.1821824088692665, "step": 3263 }, { "epoch": 0.04873860488729944, "grad_norm": 0.50390625, "grad_norm_var": 0.006314706802368164, "learning_rate": 0.0001, "loss": 1.8105, "loss/crossentropy": 2.576868176460266, "loss/fcd": 1.5546875, "loss/idx": 9.0, "loss/logits": 0.2558211386203766, "step": 3264 }, { "epoch": 0.04875353705791442, "grad_norm": 0.443359375, "grad_norm_var": 0.0061021010080973305, "learning_rate": 0.0001, "loss": 1.5544, "loss/crossentropy": 2.4428821802139282, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.1989479809999466, "step": 3265 }, { "epoch": 0.048768469228529406, "grad_norm": 0.376953125, "grad_norm_var": 0.006255086263020833, "learning_rate": 0.0001, "loss": 1.5131, "loss/crossentropy": 2.689664602279663, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.19668614864349365, "step": 3266 }, { "epoch": 0.04878340139914439, "grad_norm": 0.42578125, "grad_norm_var": 0.005658197402954102, "learning_rate": 0.0001, "loss": 1.6003, "loss/crossentropy": 2.5395193099975586, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.1979643777012825, "step": 3267 }, { "epoch": 0.04879833356975937, "grad_norm": 0.43359375, "grad_norm_var": 0.005654144287109375, "learning_rate": 0.0001, "loss": 1.436, "loss/crossentropy": 2.7576109170913696, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17033130675554276, "step": 3268 }, { "epoch": 0.04881326574037435, "grad_norm": 0.48828125, "grad_norm_var": 0.005641031265258789, "learning_rate": 0.0001, "loss": 1.7123, "loss/crossentropy": 2.3267349004745483, "loss/fcd": 1.51171875, "loss/idx": 9.0, "loss/logits": 0.20062988996505737, "step": 3269 }, { "epoch": 0.04882819791098933, "grad_norm": 0.431640625, "grad_norm_var": 0.00564263661702474, "learning_rate": 0.0001, "loss": 1.5195, "loss/crossentropy": 2.4494059085845947, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.17578484117984772, "step": 3270 }, { "epoch": 0.04884313008160431, "grad_norm": 0.40234375, "grad_norm_var": 0.005535491307576497, "learning_rate": 0.0001, "loss": 1.4788, "loss/crossentropy": 2.522171378135681, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.17406661063432693, "step": 3271 }, { "epoch": 0.048858062252219295, "grad_norm": 0.427734375, "grad_norm_var": 0.005335474014282226, "learning_rate": 0.0001, "loss": 1.5654, "loss/crossentropy": 2.6366981267929077, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.19819151610136032, "step": 3272 }, { "epoch": 0.04887299442283428, "grad_norm": 0.361328125, "grad_norm_var": 0.00224456787109375, "learning_rate": 0.0001, "loss": 1.4822, "loss/crossentropy": 2.4810651540756226, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.17358855158090591, "step": 3273 }, { "epoch": 0.04888792659344926, "grad_norm": 0.44921875, "grad_norm_var": 0.0021056493123372396, "learning_rate": 0.0001, "loss": 1.8622, "loss/crossentropy": 2.7075453996658325, "loss/fcd": 1.58984375, "loss/idx": 9.0, "loss/logits": 0.2723177373409271, "step": 3274 }, { "epoch": 0.048902858764064236, "grad_norm": 0.36328125, "grad_norm_var": 0.002341969807942708, "learning_rate": 0.0001, "loss": 1.3894, "loss/crossentropy": 2.697046637535095, "loss/fcd": 1.21875, "loss/idx": 9.0, "loss/logits": 0.1706179529428482, "step": 3275 }, { "epoch": 0.04891779093467922, "grad_norm": 0.36328125, "grad_norm_var": 0.0023401260375976564, "learning_rate": 0.0001, "loss": 1.4445, "loss/crossentropy": 2.6906559467315674, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.17492924630641937, "step": 3276 }, { "epoch": 0.0489327231052942, "grad_norm": 0.3828125, "grad_norm_var": 0.002280934651692708, "learning_rate": 0.0001, "loss": 1.5526, "loss/crossentropy": 2.365514099597931, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.18543267250061035, "step": 3277 }, { "epoch": 0.048947655275909184, "grad_norm": 0.34375, "grad_norm_var": 0.002143971125284831, "learning_rate": 0.0001, "loss": 1.4981, "loss/crossentropy": 2.633302927017212, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.19341272115707397, "step": 3278 }, { "epoch": 0.04896258744652417, "grad_norm": 0.421875, "grad_norm_var": 0.002141427993774414, "learning_rate": 0.0001, "loss": 1.5273, "loss/crossentropy": 2.49026620388031, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.18749425560235977, "step": 3279 }, { "epoch": 0.04897751961713914, "grad_norm": 0.3515625, "grad_norm_var": 0.0017595767974853515, "learning_rate": 0.0001, "loss": 1.5103, "loss/crossentropy": 2.637345790863037, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.20169149339199066, "step": 3280 }, { "epoch": 0.048992451787754125, "grad_norm": 0.375, "grad_norm_var": 0.001694488525390625, "learning_rate": 0.0001, "loss": 1.4724, "loss/crossentropy": 2.6466565132141113, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18720076978206635, "step": 3281 }, { "epoch": 0.04900738395836911, "grad_norm": 0.36328125, "grad_norm_var": 0.0017480055491129558, "learning_rate": 0.0001, "loss": 1.5663, "loss/crossentropy": 2.6036797761917114, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.19911248981952667, "step": 3282 }, { "epoch": 0.04902231612898409, "grad_norm": 0.462890625, "grad_norm_var": 0.0019663492838541668, "learning_rate": 0.0001, "loss": 1.6948, "loss/crossentropy": 2.4149054288864136, "loss/fcd": 1.47265625, "loss/idx": 9.0, "loss/logits": 0.22215893864631653, "step": 3283 }, { "epoch": 0.04903724829959907, "grad_norm": 0.369140625, "grad_norm_var": 0.0019490400950113933, "learning_rate": 0.0001, "loss": 1.5666, "loss/crossentropy": 2.56683611869812, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.20329075306653976, "step": 3284 }, { "epoch": 0.049052180470214056, "grad_norm": 0.396484375, "grad_norm_var": 0.00136260986328125, "learning_rate": 0.0001, "loss": 1.6232, "loss/crossentropy": 2.6353565454483032, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.2287018746137619, "step": 3285 }, { "epoch": 0.04906711264082903, "grad_norm": 0.373046875, "grad_norm_var": 0.0012643814086914062, "learning_rate": 0.0001, "loss": 1.442, "loss/crossentropy": 2.670197010040283, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.18023426085710526, "step": 3286 }, { "epoch": 0.049082044811444014, "grad_norm": 0.453125, "grad_norm_var": 0.0015230814615885416, "learning_rate": 0.0001, "loss": 1.433, "loss/crossentropy": 2.5358892679214478, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.15956322848796844, "step": 3287 }, { "epoch": 0.049096976982059, "grad_norm": 0.37890625, "grad_norm_var": 0.0014336744944254557, "learning_rate": 0.0001, "loss": 1.6924, "loss/crossentropy": 2.5829485654830933, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.2431444376707077, "step": 3288 }, { "epoch": 0.04911190915267398, "grad_norm": 0.416015625, "grad_norm_var": 0.0014256636301676433, "learning_rate": 0.0001, "loss": 1.5742, "loss/crossentropy": 2.6523520946502686, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.21093079447746277, "step": 3289 }, { "epoch": 0.04912684132328896, "grad_norm": 0.3515625, "grad_norm_var": 0.0012698968251546223, "learning_rate": 0.0001, "loss": 1.3162, "loss/crossentropy": 2.3562008142471313, "loss/fcd": 1.17578125, "loss/idx": 9.0, "loss/logits": 0.14043575525283813, "step": 3290 }, { "epoch": 0.04914177349390394, "grad_norm": 0.462890625, "grad_norm_var": 0.0015965779622395834, "learning_rate": 0.0001, "loss": 1.9309, "loss/crossentropy": 2.5745856761932373, "loss/fcd": 1.66015625, "loss/idx": 9.0, "loss/logits": 0.27078838646411896, "step": 3291 }, { "epoch": 0.04915670566451892, "grad_norm": 0.36328125, "grad_norm_var": 0.0015965779622395834, "learning_rate": 0.0001, "loss": 1.4338, "loss/crossentropy": 2.725277304649353, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.17988409847021103, "step": 3292 }, { "epoch": 0.049171637835133904, "grad_norm": 0.494140625, "grad_norm_var": 0.002240737279256185, "learning_rate": 0.0001, "loss": 1.6486, "loss/crossentropy": 2.6383020877838135, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.20719582587480545, "step": 3293 }, { "epoch": 0.049186570005748886, "grad_norm": 0.41015625, "grad_norm_var": 0.0020310560862223308, "learning_rate": 0.0001, "loss": 1.6086, "loss/crossentropy": 2.666481852531433, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.2375478595495224, "step": 3294 }, { "epoch": 0.04920150217636387, "grad_norm": 0.375, "grad_norm_var": 0.002048603693644206, "learning_rate": 0.0001, "loss": 1.4772, "loss/crossentropy": 2.8113726377487183, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.1803249791264534, "step": 3295 }, { "epoch": 0.04921643434697885, "grad_norm": 0.376953125, "grad_norm_var": 0.0019256591796875, "learning_rate": 0.0001, "loss": 1.5698, "loss/crossentropy": 2.656274437904358, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.2065204530954361, "step": 3296 }, { "epoch": 0.04923136651759383, "grad_norm": 0.458984375, "grad_norm_var": 0.002071237564086914, "learning_rate": 0.0001, "loss": 1.64, "loss/crossentropy": 2.577187657356262, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.2220742255449295, "step": 3297 }, { "epoch": 0.04924629868820881, "grad_norm": 0.435546875, "grad_norm_var": 0.001980082194010417, "learning_rate": 0.0001, "loss": 1.5435, "loss/crossentropy": 2.681047797203064, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.19973412156105042, "step": 3298 }, { "epoch": 0.04926123085882379, "grad_norm": 0.3671875, "grad_norm_var": 0.00189207394917806, "learning_rate": 0.0001, "loss": 1.4818, "loss/crossentropy": 2.625427484512329, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18492358922958374, "step": 3299 }, { "epoch": 0.049276163029438776, "grad_norm": 0.39453125, "grad_norm_var": 0.001810455322265625, "learning_rate": 0.0001, "loss": 1.7014, "loss/crossentropy": 2.5090383291244507, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.2599739283323288, "step": 3300 }, { "epoch": 0.04929109520005376, "grad_norm": 0.3984375, "grad_norm_var": 0.001808023452758789, "learning_rate": 0.0001, "loss": 1.6017, "loss/crossentropy": 2.5013610124588013, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.19934818148612976, "step": 3301 }, { "epoch": 0.049306027370668734, "grad_norm": 0.76953125, "grad_norm_var": 0.009845479329427084, "learning_rate": 0.0001, "loss": 1.8071, "loss/crossentropy": 2.7497187852859497, "loss/fcd": 1.5625, "loss/idx": 9.0, "loss/logits": 0.24457994103431702, "step": 3302 }, { "epoch": 0.04932095954128372, "grad_norm": 0.50390625, "grad_norm_var": 0.010152117411295573, "learning_rate": 0.0001, "loss": 1.6748, "loss/crossentropy": 2.6025500297546387, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.2216930240392685, "step": 3303 }, { "epoch": 0.0493358917118987, "grad_norm": 0.48046875, "grad_norm_var": 0.010039710998535156, "learning_rate": 0.0001, "loss": 1.7614, "loss/crossentropy": 2.6149940490722656, "loss/fcd": 1.4921875, "loss/idx": 9.0, "loss/logits": 0.26922978460788727, "step": 3304 }, { "epoch": 0.04935082388251368, "grad_norm": 0.416015625, "grad_norm_var": 0.010039710998535156, "learning_rate": 0.0001, "loss": 1.482, "loss/crossentropy": 2.598615288734436, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.1850811243057251, "step": 3305 }, { "epoch": 0.049365756053128665, "grad_norm": 0.423828125, "grad_norm_var": 0.009502776463826497, "learning_rate": 0.0001, "loss": 1.4814, "loss/crossentropy": 2.665157198905945, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.1923278570175171, "step": 3306 }, { "epoch": 0.04938068822374365, "grad_norm": 0.44921875, "grad_norm_var": 0.009483083089192709, "learning_rate": 0.0001, "loss": 1.8142, "loss/crossentropy": 2.5078773498535156, "loss/fcd": 1.5625, "loss/idx": 9.0, "loss/logits": 0.25169386714696884, "step": 3307 }, { "epoch": 0.04939562039435862, "grad_norm": 0.435546875, "grad_norm_var": 0.009023777643839518, "learning_rate": 0.0001, "loss": 1.5457, "loss/crossentropy": 2.7746816873550415, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19408879429101944, "step": 3308 }, { "epoch": 0.049410552564973606, "grad_norm": 0.369140625, "grad_norm_var": 0.009253676732381184, "learning_rate": 0.0001, "loss": 1.594, "loss/crossentropy": 2.6566224098205566, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.2189541757106781, "step": 3309 }, { "epoch": 0.04942548473558859, "grad_norm": 0.427734375, "grad_norm_var": 0.00919946034749349, "learning_rate": 0.0001, "loss": 1.7968, "loss/crossentropy": 2.327500581741333, "loss/fcd": 1.55078125, "loss/idx": 9.0, "loss/logits": 0.24599139392375946, "step": 3310 }, { "epoch": 0.04944041690620357, "grad_norm": 0.439453125, "grad_norm_var": 0.008877929051717122, "learning_rate": 0.0001, "loss": 1.5227, "loss/crossentropy": 2.647882580757141, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19849887490272522, "step": 3311 }, { "epoch": 0.049455349076818554, "grad_norm": 0.4453125, "grad_norm_var": 0.008534685770670573, "learning_rate": 0.0001, "loss": 1.4741, "loss/crossentropy": 2.7565746307373047, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.17335714399814606, "step": 3312 }, { "epoch": 0.04947028124743353, "grad_norm": 0.404296875, "grad_norm_var": 0.008662859598795572, "learning_rate": 0.0001, "loss": 1.5332, "loss/crossentropy": 2.5478798151016235, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.18949925154447556, "step": 3313 }, { "epoch": 0.04948521341804851, "grad_norm": 0.451171875, "grad_norm_var": 0.008653195699055989, "learning_rate": 0.0001, "loss": 1.5932, "loss/crossentropy": 2.787394881248474, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.2025272101163864, "step": 3314 }, { "epoch": 0.049500145588663495, "grad_norm": 0.34765625, "grad_norm_var": 0.008888753255208333, "learning_rate": 0.0001, "loss": 1.4796, "loss/crossentropy": 2.5598262548446655, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.19051914662122726, "step": 3315 }, { "epoch": 0.04951507775927848, "grad_norm": 0.94921875, "grad_norm_var": 0.02421849568684896, "learning_rate": 0.0001, "loss": 1.8787, "loss/crossentropy": 2.949846386909485, "loss/fcd": 1.6171875, "loss/idx": 9.0, "loss/logits": 0.26146290451288223, "step": 3316 }, { "epoch": 0.04953000992989346, "grad_norm": 0.42578125, "grad_norm_var": 0.02396081288655599, "learning_rate": 0.0001, "loss": 1.5081, "loss/crossentropy": 2.7375699281692505, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19560128450393677, "step": 3317 }, { "epoch": 0.04954494210050844, "grad_norm": 0.4140625, "grad_norm_var": 0.018308258056640624, "learning_rate": 0.0001, "loss": 1.4932, "loss/crossentropy": 2.7210699319839478, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.18463025987148285, "step": 3318 }, { "epoch": 0.04955987427112342, "grad_norm": 0.435546875, "grad_norm_var": 0.0182131290435791, "learning_rate": 0.0001, "loss": 1.5519, "loss/crossentropy": 2.441086530685425, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.2042396366596222, "step": 3319 }, { "epoch": 0.0495748064417384, "grad_norm": 0.37890625, "grad_norm_var": 0.018542083104451497, "learning_rate": 0.0001, "loss": 1.4739, "loss/crossentropy": 2.4841556549072266, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.17306936532258987, "step": 3320 }, { "epoch": 0.049589738612353385, "grad_norm": 0.3671875, "grad_norm_var": 0.018917592366536458, "learning_rate": 0.0001, "loss": 1.4693, "loss/crossentropy": 2.652363657951355, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.19194991141557693, "step": 3321 }, { "epoch": 0.04960467078296837, "grad_norm": 0.3515625, "grad_norm_var": 0.019474522272745768, "learning_rate": 0.0001, "loss": 1.5169, "loss/crossentropy": 2.4555782079696655, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.20442169159650803, "step": 3322 }, { "epoch": 0.04961960295358335, "grad_norm": 0.369140625, "grad_norm_var": 0.019811439514160156, "learning_rate": 0.0001, "loss": 1.4965, "loss/crossentropy": 2.5199557542800903, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.18788188695907593, "step": 3323 }, { "epoch": 0.049634535124198326, "grad_norm": 0.423828125, "grad_norm_var": 0.01982421875, "learning_rate": 0.0001, "loss": 1.6431, "loss/crossentropy": 2.555139183998108, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.22120477259159088, "step": 3324 }, { "epoch": 0.04964946729481331, "grad_norm": 0.439453125, "grad_norm_var": 0.019492340087890626, "learning_rate": 0.0001, "loss": 1.4872, "loss/crossentropy": 2.579594612121582, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.17081964015960693, "step": 3325 }, { "epoch": 0.04966439946542829, "grad_norm": 0.41015625, "grad_norm_var": 0.01954483985900879, "learning_rate": 0.0001, "loss": 1.4176, "loss/crossentropy": 2.5052610635757446, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.15974117815494537, "step": 3326 }, { "epoch": 0.049679331636043274, "grad_norm": 0.419921875, "grad_norm_var": 0.01957217852274577, "learning_rate": 0.0001, "loss": 1.5313, "loss/crossentropy": 2.423062324523926, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.19145497679710388, "step": 3327 }, { "epoch": 0.049694263806658256, "grad_norm": 0.373046875, "grad_norm_var": 0.019843292236328126, "learning_rate": 0.0001, "loss": 1.5458, "loss/crossentropy": 2.562502861022949, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.19811377674341202, "step": 3328 }, { "epoch": 0.04970919597727324, "grad_norm": 0.337890625, "grad_norm_var": 0.020391273498535156, "learning_rate": 0.0001, "loss": 1.5534, "loss/crossentropy": 2.4664840698242188, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.20181329548358917, "step": 3329 }, { "epoch": 0.049724128147888215, "grad_norm": 0.4765625, "grad_norm_var": 0.02050016721089681, "learning_rate": 0.0001, "loss": 1.5808, "loss/crossentropy": 2.5257933139801025, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.221375472843647, "step": 3330 }, { "epoch": 0.0497390603185032, "grad_norm": 0.40234375, "grad_norm_var": 0.02006847063700358, "learning_rate": 0.0001, "loss": 1.559, "loss/crossentropy": 2.4953675270080566, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19962020218372345, "step": 3331 }, { "epoch": 0.04975399248911818, "grad_norm": 0.44921875, "grad_norm_var": 0.0014730930328369141, "learning_rate": 0.0001, "loss": 1.7474, "loss/crossentropy": 2.646385431289673, "loss/fcd": 1.5078125, "loss/idx": 9.0, "loss/logits": 0.23961831629276276, "step": 3332 }, { "epoch": 0.04976892465973316, "grad_norm": 0.359375, "grad_norm_var": 0.0015617211659749348, "learning_rate": 0.0001, "loss": 1.5911, "loss/crossentropy": 2.5425256490707397, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.22777054458856583, "step": 3333 }, { "epoch": 0.049783856830348146, "grad_norm": 0.390625, "grad_norm_var": 0.0015537103017171225, "learning_rate": 0.0001, "loss": 1.53, "loss/crossentropy": 2.6559170484542847, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.2018807977437973, "step": 3334 }, { "epoch": 0.04979878900096313, "grad_norm": 0.404296875, "grad_norm_var": 0.0014626661936442058, "learning_rate": 0.0001, "loss": 1.5789, "loss/crossentropy": 2.762932062149048, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.21168388426303864, "step": 3335 }, { "epoch": 0.049813721171578104, "grad_norm": 0.36328125, "grad_norm_var": 0.001515817642211914, "learning_rate": 0.0001, "loss": 1.5536, "loss/crossentropy": 2.5987144708633423, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19418419897556305, "step": 3336 }, { "epoch": 0.04982865334219309, "grad_norm": 0.37109375, "grad_norm_var": 0.0015017032623291016, "learning_rate": 0.0001, "loss": 1.5561, "loss/crossentropy": 2.6873228549957275, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19670676440000534, "step": 3337 }, { "epoch": 0.04984358551280807, "grad_norm": 0.376953125, "grad_norm_var": 0.0013903299967447916, "learning_rate": 0.0001, "loss": 1.6303, "loss/crossentropy": 2.614723801612854, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.22794761508703232, "step": 3338 }, { "epoch": 0.04985851768342305, "grad_norm": 0.3828125, "grad_norm_var": 0.001349496841430664, "learning_rate": 0.0001, "loss": 1.5204, "loss/crossentropy": 2.6617860794067383, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.18448014557361603, "step": 3339 }, { "epoch": 0.049873449854038035, "grad_norm": 0.423828125, "grad_norm_var": 0.001349496841430664, "learning_rate": 0.0001, "loss": 1.6331, "loss/crossentropy": 2.7421622276306152, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.2111818790435791, "step": 3340 }, { "epoch": 0.04988838202465301, "grad_norm": 0.412109375, "grad_norm_var": 0.0012480258941650391, "learning_rate": 0.0001, "loss": 1.6159, "loss/crossentropy": 2.5724247694015503, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.21746758371591568, "step": 3341 }, { "epoch": 0.04990331419526799, "grad_norm": 0.40234375, "grad_norm_var": 0.001238234837849935, "learning_rate": 0.0001, "loss": 1.5622, "loss/crossentropy": 2.4962246417999268, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.21452497690916061, "step": 3342 }, { "epoch": 0.049918246365882976, "grad_norm": 0.337890625, "grad_norm_var": 0.00140379269917806, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.746044635772705, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.17885075509548187, "step": 3343 }, { "epoch": 0.04993317853649796, "grad_norm": 0.484375, "grad_norm_var": 0.0019048055013020833, "learning_rate": 0.0001, "loss": 1.6695, "loss/crossentropy": 2.455736756324768, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.2046310380101204, "step": 3344 }, { "epoch": 0.04994811070711294, "grad_norm": 0.75, "grad_norm_var": 0.009192514419555663, "learning_rate": 0.0001, "loss": 1.9782, "loss/crossentropy": 2.7098904848098755, "loss/fcd": 1.6484375, "loss/idx": 9.0, "loss/logits": 0.3297480344772339, "step": 3345 }, { "epoch": 0.049963042877727924, "grad_norm": 0.4375, "grad_norm_var": 0.009015130996704101, "learning_rate": 0.0001, "loss": 1.6738, "loss/crossentropy": 2.578580141067505, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.21289971470832825, "step": 3346 }, { "epoch": 0.0499779750483429, "grad_norm": 0.408203125, "grad_norm_var": 0.009002113342285156, "learning_rate": 0.0001, "loss": 1.4479, "loss/crossentropy": 2.595977544784546, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.1705770641565323, "step": 3347 }, { "epoch": 0.04999290721895788, "grad_norm": 0.412109375, "grad_norm_var": 0.008954095840454101, "learning_rate": 0.0001, "loss": 1.6182, "loss/crossentropy": 2.667595386505127, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.20417819917201996, "step": 3348 }, { "epoch": 0.050007839389572865, "grad_norm": 0.49609375, "grad_norm_var": 0.009020853042602538, "learning_rate": 0.0001, "loss": 1.7428, "loss/crossentropy": 2.517558455467224, "loss/fcd": 1.53125, "loss/idx": 9.0, "loss/logits": 0.21154146641492844, "step": 3349 }, { "epoch": 0.05002277156018785, "grad_norm": 0.4375, "grad_norm_var": 0.008922433853149414, "learning_rate": 0.0001, "loss": 1.7399, "loss/crossentropy": 2.605816602706909, "loss/fcd": 1.51953125, "loss/idx": 9.0, "loss/logits": 0.22034142911434174, "step": 3350 }, { "epoch": 0.05003770373080283, "grad_norm": 0.384765625, "grad_norm_var": 0.00901652971903483, "learning_rate": 0.0001, "loss": 1.5239, "loss/crossentropy": 3.0169183015823364, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.1996908187866211, "step": 3351 }, { "epoch": 0.05005263590141781, "grad_norm": 0.396484375, "grad_norm_var": 0.008789825439453124, "learning_rate": 0.0001, "loss": 1.5949, "loss/crossentropy": 2.725011944770813, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.20428059250116348, "step": 3352 }, { "epoch": 0.05006756807203279, "grad_norm": 0.43359375, "grad_norm_var": 0.008525339762369792, "learning_rate": 0.0001, "loss": 1.7444, "loss/crossentropy": 2.445889949798584, "loss/fcd": 1.51171875, "loss/idx": 9.0, "loss/logits": 0.23268957436084747, "step": 3353 }, { "epoch": 0.05008250024264777, "grad_norm": 0.396484375, "grad_norm_var": 0.00839532216389974, "learning_rate": 0.0001, "loss": 1.5786, "loss/crossentropy": 2.570001482963562, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.1918914020061493, "step": 3354 }, { "epoch": 0.050097432413262755, "grad_norm": 0.5078125, "grad_norm_var": 0.008464495340983072, "learning_rate": 0.0001, "loss": 1.8255, "loss/crossentropy": 2.240598499774933, "loss/fcd": 1.59765625, "loss/idx": 9.0, "loss/logits": 0.22786815464496613, "step": 3355 }, { "epoch": 0.05011236458387774, "grad_norm": 0.34375, "grad_norm_var": 0.009092060724894206, "learning_rate": 0.0001, "loss": 1.4612, "loss/crossentropy": 2.449530601501465, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18389814347028732, "step": 3356 }, { "epoch": 0.05012729675449272, "grad_norm": 0.35546875, "grad_norm_var": 0.009503682454427084, "learning_rate": 0.0001, "loss": 1.4085, "loss/crossentropy": 2.641911506652832, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.15854422748088837, "step": 3357 }, { "epoch": 0.050142228925107696, "grad_norm": 0.361328125, "grad_norm_var": 0.009795745213826498, "learning_rate": 0.0001, "loss": 1.4546, "loss/crossentropy": 2.510370969772339, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.17331445962190628, "step": 3358 }, { "epoch": 0.05015716109572268, "grad_norm": 0.470703125, "grad_norm_var": 0.009196964899698894, "learning_rate": 0.0001, "loss": 1.5699, "loss/crossentropy": 2.7504172325134277, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.20661664009094238, "step": 3359 }, { "epoch": 0.05017209326633766, "grad_norm": 0.37890625, "grad_norm_var": 0.009299961725870769, "learning_rate": 0.0001, "loss": 1.5866, "loss/crossentropy": 2.5236071348190308, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.19598190486431122, "step": 3360 }, { "epoch": 0.050187025436952644, "grad_norm": 0.416015625, "grad_norm_var": 0.002274004618326823, "learning_rate": 0.0001, "loss": 1.5358, "loss/crossentropy": 2.5238473415374756, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.18817290663719177, "step": 3361 }, { "epoch": 0.05020195760756763, "grad_norm": 0.373046875, "grad_norm_var": 0.002338520685831706, "learning_rate": 0.0001, "loss": 1.4834, "loss/crossentropy": 2.684301972389221, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.18264199048280716, "step": 3362 }, { "epoch": 0.0502168897781826, "grad_norm": 0.453125, "grad_norm_var": 0.002449289957682292, "learning_rate": 0.0001, "loss": 1.6482, "loss/crossentropy": 2.5686800479888916, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.21848054230213165, "step": 3363 }, { "epoch": 0.050231821948797585, "grad_norm": 0.376953125, "grad_norm_var": 0.002533404032389323, "learning_rate": 0.0001, "loss": 1.5666, "loss/crossentropy": 2.623072862625122, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.21896276623010635, "step": 3364 }, { "epoch": 0.05024675411941257, "grad_norm": 0.3984375, "grad_norm_var": 0.0020263671875, "learning_rate": 0.0001, "loss": 1.3895, "loss/crossentropy": 2.509666681289673, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.1550777554512024, "step": 3365 }, { "epoch": 0.05026168629002755, "grad_norm": 0.349609375, "grad_norm_var": 0.0021315097808837892, "learning_rate": 0.0001, "loss": 1.3786, "loss/crossentropy": 2.7210460901260376, "loss/fcd": 1.21484375, "loss/idx": 9.0, "loss/logits": 0.16374506056308746, "step": 3366 }, { "epoch": 0.05027661846064253, "grad_norm": 0.36328125, "grad_norm_var": 0.002203369140625, "learning_rate": 0.0001, "loss": 1.4323, "loss/crossentropy": 2.6781809329986572, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.1705775409936905, "step": 3367 }, { "epoch": 0.050291550631257516, "grad_norm": 0.400390625, "grad_norm_var": 0.0022033055623372397, "learning_rate": 0.0001, "loss": 1.4648, "loss/crossentropy": 2.694146752357483, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.1679014414548874, "step": 3368 }, { "epoch": 0.05030648280187249, "grad_norm": 0.3515625, "grad_norm_var": 0.0022420247395833334, "learning_rate": 0.0001, "loss": 1.5028, "loss/crossentropy": 2.4220833778381348, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.18643079698085785, "step": 3369 }, { "epoch": 0.050321414972487474, "grad_norm": 0.333984375, "grad_norm_var": 0.0024617513020833335, "learning_rate": 0.0001, "loss": 1.452, "loss/crossentropy": 2.4777084589004517, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.18243641406297684, "step": 3370 }, { "epoch": 0.05033634714310246, "grad_norm": 0.38671875, "grad_norm_var": 0.0014703750610351562, "learning_rate": 0.0001, "loss": 1.6722, "loss/crossentropy": 2.7902181148529053, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.23467420786619186, "step": 3371 }, { "epoch": 0.05035127931371744, "grad_norm": 0.416015625, "grad_norm_var": 0.0014274438222249349, "learning_rate": 0.0001, "loss": 1.4951, "loss/crossentropy": 2.5146236419677734, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.1903812661767006, "step": 3372 }, { "epoch": 0.05036621148433242, "grad_norm": 0.4921875, "grad_norm_var": 0.0020282586415608724, "learning_rate": 0.0001, "loss": 1.5383, "loss/crossentropy": 2.683732748031616, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.20240622758865356, "step": 3373 }, { "epoch": 0.0503811436549474, "grad_norm": 0.380859375, "grad_norm_var": 0.0019640445709228514, "learning_rate": 0.0001, "loss": 1.4708, "loss/crossentropy": 2.49574077129364, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.17782658338546753, "step": 3374 }, { "epoch": 0.05039607582556238, "grad_norm": 0.55859375, "grad_norm_var": 0.003318023681640625, "learning_rate": 0.0001, "loss": 1.7838, "loss/crossentropy": 2.586862802505493, "loss/fcd": 1.56640625, "loss/idx": 9.0, "loss/logits": 0.21737906336784363, "step": 3375 }, { "epoch": 0.050411007996177364, "grad_norm": 0.369140625, "grad_norm_var": 0.00335386594136556, "learning_rate": 0.0001, "loss": 1.5969, "loss/crossentropy": 2.481982707977295, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.21015559881925583, "step": 3376 }, { "epoch": 0.050425940166792346, "grad_norm": 0.431640625, "grad_norm_var": 0.0033998966217041017, "learning_rate": 0.0001, "loss": 1.5209, "loss/crossentropy": 2.6880468130111694, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19664078950881958, "step": 3377 }, { "epoch": 0.05044087233740733, "grad_norm": 0.37109375, "grad_norm_var": 0.003407732645670573, "learning_rate": 0.0001, "loss": 1.4013, "loss/crossentropy": 2.605507731437683, "loss/fcd": 1.23828125, "loss/idx": 9.0, "loss/logits": 0.1629970669746399, "step": 3378 }, { "epoch": 0.05045580450802231, "grad_norm": 0.427734375, "grad_norm_var": 0.0032752831776936848, "learning_rate": 0.0001, "loss": 1.6809, "loss/crossentropy": 2.4899561405181885, "loss/fcd": 1.47265625, "loss/idx": 9.0, "loss/logits": 0.20819447934627533, "step": 3379 }, { "epoch": 0.05047073667863729, "grad_norm": 0.408203125, "grad_norm_var": 0.0032381534576416014, "learning_rate": 0.0001, "loss": 1.569, "loss/crossentropy": 2.5086870193481445, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.19788877665996552, "step": 3380 }, { "epoch": 0.05048566884925227, "grad_norm": 0.35546875, "grad_norm_var": 0.003376626968383789, "learning_rate": 0.0001, "loss": 1.3657, "loss/crossentropy": 2.7047486305236816, "loss/fcd": 1.20703125, "loss/idx": 9.0, "loss/logits": 0.15870673954486847, "step": 3381 }, { "epoch": 0.05050060101986725, "grad_norm": 0.380859375, "grad_norm_var": 0.003228616714477539, "learning_rate": 0.0001, "loss": 1.5164, "loss/crossentropy": 2.752522826194763, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.1960720270872116, "step": 3382 }, { "epoch": 0.050515533190482236, "grad_norm": 0.373046875, "grad_norm_var": 0.00318450927734375, "learning_rate": 0.0001, "loss": 1.5522, "loss/crossentropy": 2.6028571128845215, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.2006167694926262, "step": 3383 }, { "epoch": 0.05053046536109722, "grad_norm": 0.369140625, "grad_norm_var": 0.003253682454427083, "learning_rate": 0.0001, "loss": 1.634, "loss/crossentropy": 2.2878557443618774, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.23554906249046326, "step": 3384 }, { "epoch": 0.050545397531712194, "grad_norm": 0.44140625, "grad_norm_var": 0.0031732559204101563, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.5902469158172607, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.1540529876947403, "step": 3385 }, { "epoch": 0.05056032970232718, "grad_norm": 0.361328125, "grad_norm_var": 0.0029574076334635416, "learning_rate": 0.0001, "loss": 1.4681, "loss/crossentropy": 2.6663898229599, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.17908070981502533, "step": 3386 }, { "epoch": 0.05057526187294216, "grad_norm": 0.373046875, "grad_norm_var": 0.003007364273071289, "learning_rate": 0.0001, "loss": 1.5739, "loss/crossentropy": 2.3662787675857544, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.19888584315776825, "step": 3387 }, { "epoch": 0.05059019404355714, "grad_norm": 0.34375, "grad_norm_var": 0.00324554443359375, "learning_rate": 0.0001, "loss": 1.4884, "loss/crossentropy": 2.496182084083557, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.17977027595043182, "step": 3388 }, { "epoch": 0.050605126214172125, "grad_norm": 1.3125, "grad_norm_var": 0.05512924194335937, "learning_rate": 0.0001, "loss": 1.528, "loss/crossentropy": 2.593655586242676, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.21940980106592178, "step": 3389 }, { "epoch": 0.05062005838478711, "grad_norm": 0.3984375, "grad_norm_var": 0.054978036880493165, "learning_rate": 0.0001, "loss": 1.5578, "loss/crossentropy": 2.4393880367279053, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19844051450490952, "step": 3390 }, { "epoch": 0.05063499055540208, "grad_norm": 0.447265625, "grad_norm_var": 0.054210662841796875, "learning_rate": 0.0001, "loss": 1.6871, "loss/crossentropy": 2.63421368598938, "loss/fcd": 1.47265625, "loss/idx": 9.0, "loss/logits": 0.21444348245859146, "step": 3391 }, { "epoch": 0.050649922726017066, "grad_norm": 0.6953125, "grad_norm_var": 0.057441059748331705, "learning_rate": 0.0001, "loss": 1.6719, "loss/crossentropy": 2.11980402469635, "loss/fcd": 1.5, "loss/idx": 9.0, "loss/logits": 0.17187748104333878, "step": 3392 }, { "epoch": 0.05066485489663205, "grad_norm": 0.396484375, "grad_norm_var": 0.057689396540323894, "learning_rate": 0.0001, "loss": 1.5462, "loss/crossentropy": 2.68084716796875, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.20241329818964005, "step": 3393 }, { "epoch": 0.05067978706724703, "grad_norm": 0.462890625, "grad_norm_var": 0.057055155436197914, "learning_rate": 0.0001, "loss": 1.5359, "loss/crossentropy": 2.653223752975464, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.1960497722029686, "step": 3394 }, { "epoch": 0.050694719237862014, "grad_norm": 0.404296875, "grad_norm_var": 0.05722681681315104, "learning_rate": 0.0001, "loss": 1.6403, "loss/crossentropy": 2.4543696641921997, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.19893023371696472, "step": 3395 }, { "epoch": 0.050709651408477, "grad_norm": 0.404296875, "grad_norm_var": 0.05726006825764974, "learning_rate": 0.0001, "loss": 1.5581, "loss/crossentropy": 2.465823173522949, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.20267478376626968, "step": 3396 }, { "epoch": 0.05072458357909197, "grad_norm": 0.451171875, "grad_norm_var": 0.056371418635050456, "learning_rate": 0.0001, "loss": 1.6782, "loss/crossentropy": 2.4386913776397705, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.2367667257785797, "step": 3397 }, { "epoch": 0.050739515749706955, "grad_norm": 0.427734375, "grad_norm_var": 0.055914417902628584, "learning_rate": 0.0001, "loss": 1.5384, "loss/crossentropy": 2.5622812509536743, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.21029071509838104, "step": 3398 }, { "epoch": 0.05075444792032194, "grad_norm": 0.447265625, "grad_norm_var": 0.05521136919657389, "learning_rate": 0.0001, "loss": 1.4728, "loss/crossentropy": 2.8046610355377197, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.16422411799430847, "step": 3399 }, { "epoch": 0.05076938009093692, "grad_norm": 0.384765625, "grad_norm_var": 0.05498833656311035, "learning_rate": 0.0001, "loss": 1.4189, "loss/crossentropy": 2.6880656480789185, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.1649605631828308, "step": 3400 }, { "epoch": 0.0507843122615519, "grad_norm": 0.36328125, "grad_norm_var": 0.055818669001261395, "learning_rate": 0.0001, "loss": 1.4676, "loss/crossentropy": 2.67470121383667, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18248559534549713, "step": 3401 }, { "epoch": 0.05079924443216688, "grad_norm": 0.404296875, "grad_norm_var": 0.05525638262430827, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.756223201751709, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.16707990318536758, "step": 3402 }, { "epoch": 0.05081417660278186, "grad_norm": 0.375, "grad_norm_var": 0.055228169759114584, "learning_rate": 0.0001, "loss": 1.5925, "loss/crossentropy": 2.3414889574050903, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.205796979367733, "step": 3403 }, { "epoch": 0.050829108773396844, "grad_norm": 0.357421875, "grad_norm_var": 0.054987064997355145, "learning_rate": 0.0001, "loss": 1.4696, "loss/crossentropy": 2.6183600425720215, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18440843373537064, "step": 3404 }, { "epoch": 0.05084404094401183, "grad_norm": 0.6015625, "grad_norm_var": 0.007973082860310872, "learning_rate": 0.0001, "loss": 1.5372, "loss/crossentropy": 2.695538878440857, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.19736036658287048, "step": 3405 }, { "epoch": 0.05085897311462681, "grad_norm": 0.32421875, "grad_norm_var": 0.008717203140258789, "learning_rate": 0.0001, "loss": 1.481, "loss/crossentropy": 2.411239266395569, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.1919359639286995, "step": 3406 }, { "epoch": 0.05087390528524179, "grad_norm": 0.3671875, "grad_norm_var": 0.008978525797526041, "learning_rate": 0.0001, "loss": 1.5273, "loss/crossentropy": 2.405904531478882, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.18750402331352234, "step": 3407 }, { "epoch": 0.05088883745585677, "grad_norm": 0.412109375, "grad_norm_var": 0.003942728042602539, "learning_rate": 0.0001, "loss": 1.6677, "loss/crossentropy": 2.397928476333618, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.21850630640983582, "step": 3408 }, { "epoch": 0.05090376962647175, "grad_norm": 0.36328125, "grad_norm_var": 0.004078102111816406, "learning_rate": 0.0001, "loss": 1.7345, "loss/crossentropy": 2.5721205472946167, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.24617432802915573, "step": 3409 }, { "epoch": 0.050918701797086734, "grad_norm": 0.365234375, "grad_norm_var": 0.00397796630859375, "learning_rate": 0.0001, "loss": 1.5374, "loss/crossentropy": 2.4987505674362183, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.2053566798567772, "step": 3410 }, { "epoch": 0.050933633967701716, "grad_norm": 0.380859375, "grad_norm_var": 0.004009246826171875, "learning_rate": 0.0001, "loss": 1.5574, "loss/crossentropy": 2.6026971340179443, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.22141285985708237, "step": 3411 }, { "epoch": 0.0509485661383167, "grad_norm": 0.41015625, "grad_norm_var": 0.0040132999420166016, "learning_rate": 0.0001, "loss": 1.7294, "loss/crossentropy": 2.5589200258255005, "loss/fcd": 1.49609375, "loss/idx": 9.0, "loss/logits": 0.23333770036697388, "step": 3412 }, { "epoch": 0.050963498308931675, "grad_norm": 0.365234375, "grad_norm_var": 0.003913990656534831, "learning_rate": 0.0001, "loss": 1.558, "loss/crossentropy": 2.5953595638275146, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.21429059654474258, "step": 3413 }, { "epoch": 0.05097843047954666, "grad_norm": 0.447265625, "grad_norm_var": 0.004018259048461914, "learning_rate": 0.0001, "loss": 1.5594, "loss/crossentropy": 2.6100367307662964, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.1883210986852646, "step": 3414 }, { "epoch": 0.05099336265016164, "grad_norm": 0.38671875, "grad_norm_var": 0.003850237528483073, "learning_rate": 0.0001, "loss": 1.4683, "loss/crossentropy": 2.6781861782073975, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.18708272278308868, "step": 3415 }, { "epoch": 0.05100829482077662, "grad_norm": 0.52734375, "grad_norm_var": 0.004939762751261393, "learning_rate": 0.0001, "loss": 1.9011, "loss/crossentropy": 2.371121883392334, "loss/fcd": 1.63671875, "loss/idx": 9.0, "loss/logits": 0.26440124958753586, "step": 3416 }, { "epoch": 0.051023226991391606, "grad_norm": 0.56640625, "grad_norm_var": 0.006437412897745768, "learning_rate": 0.0001, "loss": 1.8096, "loss/crossentropy": 2.75041401386261, "loss/fcd": 1.5625, "loss/idx": 9.0, "loss/logits": 0.24710142612457275, "step": 3417 }, { "epoch": 0.05103815916200659, "grad_norm": 0.40234375, "grad_norm_var": 0.00644067128499349, "learning_rate": 0.0001, "loss": 1.4252, "loss/crossentropy": 2.5977176427841187, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.16348198056221008, "step": 3418 }, { "epoch": 0.051053091332621564, "grad_norm": 0.458984375, "grad_norm_var": 0.006424951553344727, "learning_rate": 0.0001, "loss": 1.6778, "loss/crossentropy": 2.5855696201324463, "loss/fcd": 1.484375, "loss/idx": 9.0, "loss/logits": 0.19344550371170044, "step": 3419 }, { "epoch": 0.05106802350323655, "grad_norm": 0.390625, "grad_norm_var": 0.006212298075358073, "learning_rate": 0.0001, "loss": 1.5875, "loss/crossentropy": 2.5247955322265625, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.20472384244203568, "step": 3420 }, { "epoch": 0.05108295567385153, "grad_norm": 0.41015625, "grad_norm_var": 0.00394744873046875, "learning_rate": 0.0001, "loss": 1.6609, "loss/crossentropy": 2.312396764755249, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.22341037541627884, "step": 3421 }, { "epoch": 0.05109788784446651, "grad_norm": 0.353515625, "grad_norm_var": 0.0036615848541259764, "learning_rate": 0.0001, "loss": 1.5335, "loss/crossentropy": 2.5657237768173218, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.19367000460624695, "step": 3422 }, { "epoch": 0.051112820015081495, "grad_norm": 0.392578125, "grad_norm_var": 0.003546905517578125, "learning_rate": 0.0001, "loss": 1.5752, "loss/crossentropy": 2.504173994064331, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.21192172169685364, "step": 3423 }, { "epoch": 0.05112775218569647, "grad_norm": 0.357421875, "grad_norm_var": 0.0037516276041666668, "learning_rate": 0.0001, "loss": 1.4193, "loss/crossentropy": 2.8428211212158203, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.17316972464323044, "step": 3424 }, { "epoch": 0.05114268435631145, "grad_norm": 0.41015625, "grad_norm_var": 0.003589884440104167, "learning_rate": 0.0001, "loss": 1.5236, "loss/crossentropy": 2.6477075815200806, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19156558066606522, "step": 3425 }, { "epoch": 0.051157616526926436, "grad_norm": 0.455078125, "grad_norm_var": 0.0035094579060872396, "learning_rate": 0.0001, "loss": 1.6307, "loss/crossentropy": 2.4965333938598633, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.21275152266025543, "step": 3426 }, { "epoch": 0.05117254869754142, "grad_norm": 0.375, "grad_norm_var": 0.003541930516560872, "learning_rate": 0.0001, "loss": 1.5954, "loss/crossentropy": 2.6202927827835083, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.21650148183107376, "step": 3427 }, { "epoch": 0.0511874808681564, "grad_norm": 0.412109375, "grad_norm_var": 0.003539784749348958, "learning_rate": 0.0001, "loss": 1.5856, "loss/crossentropy": 2.5812458992004395, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.20668591558933258, "step": 3428 }, { "epoch": 0.051202413038771384, "grad_norm": 0.408203125, "grad_norm_var": 0.0033446629842122395, "learning_rate": 0.0001, "loss": 1.6392, "loss/crossentropy": 2.4704002141952515, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.21730166673660278, "step": 3429 }, { "epoch": 0.05121734520938636, "grad_norm": 0.6328125, "grad_norm_var": 0.006118504206339518, "learning_rate": 0.0001, "loss": 1.6571, "loss/crossentropy": 2.423509120941162, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.20393849909305573, "step": 3430 }, { "epoch": 0.05123227738000134, "grad_norm": 0.451171875, "grad_norm_var": 0.0059742609659830725, "learning_rate": 0.0001, "loss": 1.4917, "loss/crossentropy": 2.7623183727264404, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.17922090739011765, "step": 3431 }, { "epoch": 0.051247209550616325, "grad_norm": 0.39453125, "grad_norm_var": 0.00549004872639974, "learning_rate": 0.0001, "loss": 1.4026, "loss/crossentropy": 2.335055112838745, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.15651090443134308, "step": 3432 }, { "epoch": 0.05126214172123131, "grad_norm": 0.44921875, "grad_norm_var": 0.00420831044514974, "learning_rate": 0.0001, "loss": 1.7515, "loss/crossentropy": 2.594982624053955, "loss/fcd": 1.5078125, "loss/idx": 9.0, "loss/logits": 0.2437201887369156, "step": 3433 }, { "epoch": 0.05127707389184629, "grad_norm": 0.384765625, "grad_norm_var": 0.00427397092183431, "learning_rate": 0.0001, "loss": 1.5747, "loss/crossentropy": 2.6292624473571777, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20748884975910187, "step": 3434 }, { "epoch": 0.051292006062461266, "grad_norm": 0.3671875, "grad_norm_var": 0.004335975646972657, "learning_rate": 0.0001, "loss": 1.4348, "loss/crossentropy": 2.5977706909179688, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.1730734333395958, "step": 3435 }, { "epoch": 0.05130693823307625, "grad_norm": 0.380859375, "grad_norm_var": 0.0043740431467692055, "learning_rate": 0.0001, "loss": 1.5133, "loss/crossentropy": 2.5793784856796265, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.19296550750732422, "step": 3436 }, { "epoch": 0.05132187040369123, "grad_norm": 0.369140625, "grad_norm_var": 0.004503885904947917, "learning_rate": 0.0001, "loss": 1.45, "loss/crossentropy": 2.78774893283844, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.18047862499952316, "step": 3437 }, { "epoch": 0.051336802574306215, "grad_norm": 0.400390625, "grad_norm_var": 0.004275004069010417, "learning_rate": 0.0001, "loss": 1.4711, "loss/crossentropy": 2.29997456073761, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.17423991858959198, "step": 3438 }, { "epoch": 0.0513517347449212, "grad_norm": 0.388671875, "grad_norm_var": 0.00428765614827474, "learning_rate": 0.0001, "loss": 1.6485, "loss/crossentropy": 2.647382140159607, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.22661471366882324, "step": 3439 }, { "epoch": 0.05136666691553618, "grad_norm": 0.419921875, "grad_norm_var": 0.004053688049316407, "learning_rate": 0.0001, "loss": 1.5897, "loss/crossentropy": 2.4566662311553955, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.19122587889432907, "step": 3440 }, { "epoch": 0.051381599086151156, "grad_norm": 0.3515625, "grad_norm_var": 0.00433502197265625, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.5999128818511963, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.19267628341913223, "step": 3441 }, { "epoch": 0.05139653125676614, "grad_norm": 0.3828125, "grad_norm_var": 0.004275623957316081, "learning_rate": 0.0001, "loss": 1.4966, "loss/crossentropy": 2.5817168951034546, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.1880125254392624, "step": 3442 }, { "epoch": 0.05141146342738112, "grad_norm": 0.408203125, "grad_norm_var": 0.004187266031901042, "learning_rate": 0.0001, "loss": 1.6034, "loss/crossentropy": 2.454679489135742, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.21280641853809357, "step": 3443 }, { "epoch": 0.051426395597996104, "grad_norm": 0.380859375, "grad_norm_var": 0.004250335693359375, "learning_rate": 0.0001, "loss": 1.7513, "loss/crossentropy": 2.7751805782318115, "loss/fcd": 1.4921875, "loss/idx": 9.0, "loss/logits": 0.25915253162384033, "step": 3444 }, { "epoch": 0.051441327768611086, "grad_norm": 0.47265625, "grad_norm_var": 0.004488992691040039, "learning_rate": 0.0001, "loss": 1.7909, "loss/crossentropy": 2.7293131351470947, "loss/fcd": 1.53515625, "loss/idx": 9.0, "loss/logits": 0.2557414323091507, "step": 3445 }, { "epoch": 0.05145625993922606, "grad_norm": 0.349609375, "grad_norm_var": 0.0012646993001302084, "learning_rate": 0.0001, "loss": 1.4725, "loss/crossentropy": 2.640157103538513, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.17565272748470306, "step": 3446 }, { "epoch": 0.051471192109841045, "grad_norm": 0.41796875, "grad_norm_var": 0.0010936578114827474, "learning_rate": 0.0001, "loss": 1.6399, "loss/crossentropy": 2.5622098445892334, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.21021165698766708, "step": 3447 }, { "epoch": 0.05148612428045603, "grad_norm": 0.40625, "grad_norm_var": 0.00110166867574056, "learning_rate": 0.0001, "loss": 1.5431, "loss/crossentropy": 2.193228840827942, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.19546884298324585, "step": 3448 }, { "epoch": 0.05150105645107101, "grad_norm": 0.427734375, "grad_norm_var": 0.000977007548014323, "learning_rate": 0.0001, "loss": 1.6255, "loss/crossentropy": 2.644696593284607, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.2114245891571045, "step": 3449 }, { "epoch": 0.05151598862168599, "grad_norm": 0.404296875, "grad_norm_var": 0.0009760538736979167, "learning_rate": 0.0001, "loss": 1.3803, "loss/crossentropy": 2.6519217491149902, "loss/fcd": 1.2265625, "loss/idx": 9.0, "loss/logits": 0.15371173620224, "step": 3450 }, { "epoch": 0.051530920792300976, "grad_norm": 0.365234375, "grad_norm_var": 0.0009836673736572266, "learning_rate": 0.0001, "loss": 1.4363, "loss/crossentropy": 2.7089725732803345, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.17460782825946808, "step": 3451 }, { "epoch": 0.05154585296291595, "grad_norm": 0.3671875, "grad_norm_var": 0.0010218302408854167, "learning_rate": 0.0001, "loss": 1.5533, "loss/crossentropy": 2.6210272312164307, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.2095654532313347, "step": 3452 }, { "epoch": 0.051560785133530934, "grad_norm": 0.41015625, "grad_norm_var": 0.0009881178538004556, "learning_rate": 0.0001, "loss": 1.5801, "loss/crossentropy": 2.4631502628326416, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.17772500216960907, "step": 3453 }, { "epoch": 0.05157571730414592, "grad_norm": 0.38671875, "grad_norm_var": 0.0009937922159830729, "learning_rate": 0.0001, "loss": 1.5695, "loss/crossentropy": 2.627729058265686, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.20623022317886353, "step": 3454 }, { "epoch": 0.0515906494747609, "grad_norm": 0.34765625, "grad_norm_var": 0.0011403242746988933, "learning_rate": 0.0001, "loss": 1.476, "loss/crossentropy": 2.6381596326828003, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18692880868911743, "step": 3455 }, { "epoch": 0.05160558164537588, "grad_norm": 0.412109375, "grad_norm_var": 0.001116800308227539, "learning_rate": 0.0001, "loss": 1.6073, "loss/crossentropy": 2.146046817302704, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.17367637157440186, "step": 3456 }, { "epoch": 0.051620513815990865, "grad_norm": 0.375, "grad_norm_var": 0.0010210514068603516, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.723568916320801, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.1897224560379982, "step": 3457 }, { "epoch": 0.05163544598660584, "grad_norm": 0.4453125, "grad_norm_var": 0.001166518529256185, "learning_rate": 0.0001, "loss": 1.6171, "loss/crossentropy": 2.3714317083358765, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.19132807105779648, "step": 3458 }, { "epoch": 0.05165037815722082, "grad_norm": 0.50390625, "grad_norm_var": 0.001862017313639323, "learning_rate": 0.0001, "loss": 2.0309, "loss/crossentropy": 2.6780686378479004, "loss/fcd": 1.7421875, "loss/idx": 9.0, "loss/logits": 0.2887064889073372, "step": 3459 }, { "epoch": 0.051665310327835806, "grad_norm": 0.390625, "grad_norm_var": 0.00183714230855306, "learning_rate": 0.0001, "loss": 1.6414, "loss/crossentropy": 2.948399782180786, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.23910070210695267, "step": 3460 }, { "epoch": 0.05168024249845079, "grad_norm": 0.337890625, "grad_norm_var": 0.0017592748006184896, "learning_rate": 0.0001, "loss": 1.4486, "loss/crossentropy": 2.6767112016677856, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.18295221775770187, "step": 3461 }, { "epoch": 0.05169517466906577, "grad_norm": 0.3671875, "grad_norm_var": 0.0016681512196858725, "learning_rate": 0.0001, "loss": 1.4278, "loss/crossentropy": 2.643781900405884, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.17384568601846695, "step": 3462 }, { "epoch": 0.05171010683968075, "grad_norm": 0.3828125, "grad_norm_var": 0.00165098508199056, "learning_rate": 0.0001, "loss": 1.5154, "loss/crossentropy": 2.7534897327423096, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18334726989269257, "step": 3463 }, { "epoch": 0.05172503901029573, "grad_norm": 0.3828125, "grad_norm_var": 0.0016521294911702475, "learning_rate": 0.0001, "loss": 1.5812, "loss/crossentropy": 2.878259778022766, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.20620445907115936, "step": 3464 }, { "epoch": 0.05173997118091071, "grad_norm": 0.43359375, "grad_norm_var": 0.0016805013020833334, "learning_rate": 0.0001, "loss": 1.4995, "loss/crossentropy": 2.5496610403060913, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.19086632877588272, "step": 3465 }, { "epoch": 0.051754903351525695, "grad_norm": 0.40625, "grad_norm_var": 0.0016832828521728515, "learning_rate": 0.0001, "loss": 1.6015, "loss/crossentropy": 2.679402232170105, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.20308122783899307, "step": 3466 }, { "epoch": 0.05176983552214068, "grad_norm": 0.369140625, "grad_norm_var": 0.0016689141591389974, "learning_rate": 0.0001, "loss": 1.5471, "loss/crossentropy": 2.5767383575439453, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.1876918375492096, "step": 3467 }, { "epoch": 0.05178476769275566, "grad_norm": 0.37109375, "grad_norm_var": 0.001655435562133789, "learning_rate": 0.0001, "loss": 1.4647, "loss/crossentropy": 2.738846778869629, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.17568714916706085, "step": 3468 }, { "epoch": 0.05179969986337064, "grad_norm": 0.353515625, "grad_norm_var": 0.0017425537109375, "learning_rate": 0.0001, "loss": 1.4474, "loss/crossentropy": 2.5080716609954834, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.1661972850561142, "step": 3469 }, { "epoch": 0.05181463203398562, "grad_norm": 0.416015625, "grad_norm_var": 0.0017771244049072266, "learning_rate": 0.0001, "loss": 1.5347, "loss/crossentropy": 2.6081093549728394, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.19481997191905975, "step": 3470 }, { "epoch": 0.0518295642046006, "grad_norm": 1.0234375, "grad_norm_var": 0.026195001602172852, "learning_rate": 0.0001, "loss": 1.9078, "loss/crossentropy": 2.6458232402801514, "loss/fcd": 1.58203125, "loss/idx": 9.0, "loss/logits": 0.3257727175951004, "step": 3471 }, { "epoch": 0.051844496375215585, "grad_norm": 0.41796875, "grad_norm_var": 0.026178741455078126, "learning_rate": 0.0001, "loss": 1.6227, "loss/crossentropy": 2.7039071321487427, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.22039306908845901, "step": 3472 }, { "epoch": 0.05185942854583057, "grad_norm": 0.4296875, "grad_norm_var": 0.025920613606770834, "learning_rate": 0.0001, "loss": 1.5006, "loss/crossentropy": 2.712033748626709, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.18417691439390182, "step": 3473 }, { "epoch": 0.05187436071644554, "grad_norm": 0.41015625, "grad_norm_var": 0.02597039540608724, "learning_rate": 0.0001, "loss": 1.4757, "loss/crossentropy": 2.6599977016448975, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18268226832151413, "step": 3474 }, { "epoch": 0.051889292887060526, "grad_norm": 0.35546875, "grad_norm_var": 0.02602837880452474, "learning_rate": 0.0001, "loss": 1.567, "loss/crossentropy": 2.2886472940444946, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.2115393504500389, "step": 3475 }, { "epoch": 0.05190422505767551, "grad_norm": 0.375, "grad_norm_var": 0.02612145741780599, "learning_rate": 0.0001, "loss": 1.4551, "loss/crossentropy": 2.725509762763977, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.1738094687461853, "step": 3476 }, { "epoch": 0.05191915722829049, "grad_norm": 0.3515625, "grad_norm_var": 0.0259706974029541, "learning_rate": 0.0001, "loss": 1.6281, "loss/crossentropy": 2.5487940311431885, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.2257729023694992, "step": 3477 }, { "epoch": 0.051934089398905474, "grad_norm": 0.435546875, "grad_norm_var": 0.025709788004557293, "learning_rate": 0.0001, "loss": 1.5202, "loss/crossentropy": 2.5278842449188232, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19207017868757248, "step": 3478 }, { "epoch": 0.05194902156952046, "grad_norm": 0.427734375, "grad_norm_var": 0.025540526707967123, "learning_rate": 0.0001, "loss": 1.6194, "loss/crossentropy": 2.5650179386138916, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.21707092970609665, "step": 3479 }, { "epoch": 0.05196395374013543, "grad_norm": 0.5703125, "grad_norm_var": 0.026434691747029622, "learning_rate": 0.0001, "loss": 1.5907, "loss/crossentropy": 2.5376354455947876, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.19616550207138062, "step": 3480 }, { "epoch": 0.051978885910750415, "grad_norm": 0.380859375, "grad_norm_var": 0.026700337727864582, "learning_rate": 0.0001, "loss": 1.4297, "loss/crossentropy": 2.5226409435272217, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.17580606043338776, "step": 3481 }, { "epoch": 0.0519938180813654, "grad_norm": 0.42578125, "grad_norm_var": 0.026627540588378906, "learning_rate": 0.0001, "loss": 1.512, "loss/crossentropy": 2.4601471424102783, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.17996357381343842, "step": 3482 }, { "epoch": 0.05200875025198038, "grad_norm": 0.37890625, "grad_norm_var": 0.0265352725982666, "learning_rate": 0.0001, "loss": 1.5829, "loss/crossentropy": 2.657753348350525, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.211819589138031, "step": 3483 }, { "epoch": 0.05202368242259536, "grad_norm": 0.416015625, "grad_norm_var": 0.02621758778889974, "learning_rate": 0.0001, "loss": 1.5153, "loss/crossentropy": 2.699800133705139, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.1950150579214096, "step": 3484 }, { "epoch": 0.05203861459321034, "grad_norm": 0.369140625, "grad_norm_var": 0.02603600819905599, "learning_rate": 0.0001, "loss": 1.5119, "loss/crossentropy": 2.5671803951263428, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19939013570547104, "step": 3485 }, { "epoch": 0.05205354676382532, "grad_norm": 0.33984375, "grad_norm_var": 0.02673338254292806, "learning_rate": 0.0001, "loss": 1.4586, "loss/crossentropy": 2.6328150033950806, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.17730768769979477, "step": 3486 }, { "epoch": 0.052068478934440304, "grad_norm": 0.4609375, "grad_norm_var": 0.0030670007069905597, "learning_rate": 0.0001, "loss": 1.7175, "loss/crossentropy": 2.4848986864089966, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.23702863603830338, "step": 3487 }, { "epoch": 0.05208341110505529, "grad_norm": 0.361328125, "grad_norm_var": 0.0032002131144205728, "learning_rate": 0.0001, "loss": 1.5685, "loss/crossentropy": 2.3707001209259033, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.2013426274061203, "step": 3488 }, { "epoch": 0.05209834327567027, "grad_norm": 0.375, "grad_norm_var": 0.003210894266764323, "learning_rate": 0.0001, "loss": 1.5761, "loss/crossentropy": 2.5230042934417725, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.20109081268310547, "step": 3489 }, { "epoch": 0.05211327544628525, "grad_norm": 0.451171875, "grad_norm_var": 0.0033600966135660807, "learning_rate": 0.0001, "loss": 1.7306, "loss/crossentropy": 2.5210148096084595, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.2501562312245369, "step": 3490 }, { "epoch": 0.05212820761690023, "grad_norm": 0.36328125, "grad_norm_var": 0.00331266721089681, "learning_rate": 0.0001, "loss": 1.5123, "loss/crossentropy": 2.4743778705596924, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.18421833217144012, "step": 3491 }, { "epoch": 0.05214313978751521, "grad_norm": 0.451171875, "grad_norm_var": 0.0033690770467122394, "learning_rate": 0.0001, "loss": 1.556, "loss/crossentropy": 2.5974942445755005, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.19269028306007385, "step": 3492 }, { "epoch": 0.052158071958130194, "grad_norm": 0.365234375, "grad_norm_var": 0.003274393081665039, "learning_rate": 0.0001, "loss": 1.4053, "loss/crossentropy": 2.296021580696106, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.15924721956253052, "step": 3493 }, { "epoch": 0.052173004128745176, "grad_norm": 0.3828125, "grad_norm_var": 0.0032739639282226562, "learning_rate": 0.0001, "loss": 1.6222, "loss/crossentropy": 2.5853958129882812, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.21988408267498016, "step": 3494 }, { "epoch": 0.05218793629936016, "grad_norm": 0.421875, "grad_norm_var": 0.0032602787017822266, "learning_rate": 0.0001, "loss": 1.6439, "loss/crossentropy": 2.67590069770813, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.2141997516155243, "step": 3495 }, { "epoch": 0.052202868469975135, "grad_norm": 0.431640625, "grad_norm_var": 0.0014444986979166667, "learning_rate": 0.0001, "loss": 1.6663, "loss/crossentropy": 2.566882848739624, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.228822760283947, "step": 3496 }, { "epoch": 0.05221780064059012, "grad_norm": 0.4140625, "grad_norm_var": 0.0014355818430582682, "learning_rate": 0.0001, "loss": 1.6052, "loss/crossentropy": 2.5256869792938232, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.2145943120121956, "step": 3497 }, { "epoch": 0.0522327328112051, "grad_norm": 0.388671875, "grad_norm_var": 0.001396624247233073, "learning_rate": 0.0001, "loss": 1.6093, "loss/crossentropy": 2.726364016532898, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.2147740125656128, "step": 3498 }, { "epoch": 0.05224766498182008, "grad_norm": 0.4296875, "grad_norm_var": 0.0014272054036458333, "learning_rate": 0.0001, "loss": 1.4975, "loss/crossentropy": 2.507602572441101, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.18111993372440338, "step": 3499 }, { "epoch": 0.052262597152435065, "grad_norm": 0.345703125, "grad_norm_var": 0.0015988667805989583, "learning_rate": 0.0001, "loss": 1.4008, "loss/crossentropy": 2.636507987976074, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.16639221459627151, "step": 3500 }, { "epoch": 0.05227752932305005, "grad_norm": 0.349609375, "grad_norm_var": 0.0016951878865559896, "learning_rate": 0.0001, "loss": 1.435, "loss/crossentropy": 2.5759679079055786, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.17723525315523148, "step": 3501 }, { "epoch": 0.052292461493665024, "grad_norm": 0.482421875, "grad_norm_var": 0.0019028822580973307, "learning_rate": 0.0001, "loss": 1.7319, "loss/crossentropy": 2.697183847427368, "loss/fcd": 1.50390625, "loss/idx": 9.0, "loss/logits": 0.22798918932676315, "step": 3502 }, { "epoch": 0.05230739366428001, "grad_norm": 0.359375, "grad_norm_var": 0.0017855167388916016, "learning_rate": 0.0001, "loss": 1.4288, "loss/crossentropy": 2.5916539430618286, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.16708090901374817, "step": 3503 }, { "epoch": 0.05232232583489499, "grad_norm": 0.412109375, "grad_norm_var": 0.0016962528228759766, "learning_rate": 0.0001, "loss": 1.6241, "loss/crossentropy": 2.725490927696228, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.22171706706285477, "step": 3504 }, { "epoch": 0.05233725800550997, "grad_norm": 0.35546875, "grad_norm_var": 0.001789077123006185, "learning_rate": 0.0001, "loss": 1.398, "loss/crossentropy": 2.733468770980835, "loss/fcd": 1.22265625, "loss/idx": 9.0, "loss/logits": 0.17536749690771103, "step": 3505 }, { "epoch": 0.052352190176124955, "grad_norm": 0.33203125, "grad_norm_var": 0.0018676122029622395, "learning_rate": 0.0001, "loss": 1.5459, "loss/crossentropy": 2.2334933280944824, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.1904454156756401, "step": 3506 }, { "epoch": 0.05236712234673993, "grad_norm": 0.36328125, "grad_norm_var": 0.0018676122029622395, "learning_rate": 0.0001, "loss": 1.4243, "loss/crossentropy": 2.825354814529419, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.1781739667057991, "step": 3507 }, { "epoch": 0.05238205451735491, "grad_norm": 0.33984375, "grad_norm_var": 0.00177610715230306, "learning_rate": 0.0001, "loss": 1.4742, "loss/crossentropy": 2.6760584115982056, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.1812199428677559, "step": 3508 }, { "epoch": 0.052396986687969896, "grad_norm": 0.43359375, "grad_norm_var": 0.0018801371256510417, "learning_rate": 0.0001, "loss": 2.211, "loss/crossentropy": 2.374463438987732, "loss/fcd": 1.84765625, "loss/idx": 9.0, "loss/logits": 0.36336249113082886, "step": 3509 }, { "epoch": 0.05241191885858488, "grad_norm": 0.478515625, "grad_norm_var": 0.0023591200510660808, "learning_rate": 0.0001, "loss": 1.6818, "loss/crossentropy": 2.4249950647354126, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.2403586581349373, "step": 3510 }, { "epoch": 0.05242685102919986, "grad_norm": 0.359375, "grad_norm_var": 0.002388620376586914, "learning_rate": 0.0001, "loss": 1.4018, "loss/crossentropy": 2.436414122581482, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.16742635518312454, "step": 3511 }, { "epoch": 0.052441783199814844, "grad_norm": 0.404296875, "grad_norm_var": 0.002291599909464518, "learning_rate": 0.0001, "loss": 1.8662, "loss/crossentropy": 2.5082212686538696, "loss/fcd": 1.55078125, "loss/idx": 9.0, "loss/logits": 0.3154417723417282, "step": 3512 }, { "epoch": 0.05245671537042982, "grad_norm": 0.33203125, "grad_norm_var": 0.0024544874827067058, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.564389705657959, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.16873634606599808, "step": 3513 }, { "epoch": 0.0524716475410448, "grad_norm": 0.392578125, "grad_norm_var": 0.0024571577707926434, "learning_rate": 0.0001, "loss": 1.5198, "loss/crossentropy": 2.652384877204895, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.17996994405984879, "step": 3514 }, { "epoch": 0.052486579711659785, "grad_norm": 0.400390625, "grad_norm_var": 0.002338663736979167, "learning_rate": 0.0001, "loss": 1.5271, "loss/crossentropy": 2.6056991815567017, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.17947600781917572, "step": 3515 }, { "epoch": 0.05250151188227477, "grad_norm": 0.43359375, "grad_norm_var": 0.002375141779581706, "learning_rate": 0.0001, "loss": 1.5405, "loss/crossentropy": 2.7993791103363037, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.18893015384674072, "step": 3516 }, { "epoch": 0.05251644405288975, "grad_norm": 0.396484375, "grad_norm_var": 0.002264515558878581, "learning_rate": 0.0001, "loss": 1.5285, "loss/crossentropy": 2.4804948568344116, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19646970927715302, "step": 3517 }, { "epoch": 0.052531376223504726, "grad_norm": 0.384765625, "grad_norm_var": 0.001685953140258789, "learning_rate": 0.0001, "loss": 1.6266, "loss/crossentropy": 2.424618721008301, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.22030997276306152, "step": 3518 }, { "epoch": 0.05254630839411971, "grad_norm": 0.3671875, "grad_norm_var": 0.0016619205474853516, "learning_rate": 0.0001, "loss": 1.5811, "loss/crossentropy": 2.45977246761322, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.19825930148363113, "step": 3519 }, { "epoch": 0.05256124056473469, "grad_norm": 0.421875, "grad_norm_var": 0.0017011006673177083, "learning_rate": 0.0001, "loss": 1.6607, "loss/crossentropy": 2.6631182432174683, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.23101946711540222, "step": 3520 }, { "epoch": 0.052576172735349674, "grad_norm": 0.41796875, "grad_norm_var": 0.001680755615234375, "learning_rate": 0.0001, "loss": 1.3942, "loss/crossentropy": 2.74634325504303, "loss/fcd": 1.22265625, "loss/idx": 9.0, "loss/logits": 0.171591117978096, "step": 3521 }, { "epoch": 0.05259110490596466, "grad_norm": 0.486328125, "grad_norm_var": 0.001953236262003581, "learning_rate": 0.0001, "loss": 1.7331, "loss/crossentropy": 2.266065239906311, "loss/fcd": 1.52734375, "loss/idx": 9.0, "loss/logits": 0.20578981190919876, "step": 3522 }, { "epoch": 0.05260603707657964, "grad_norm": 0.380859375, "grad_norm_var": 0.0018847147623697917, "learning_rate": 0.0001, "loss": 1.4411, "loss/crossentropy": 2.704484224319458, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17549577355384827, "step": 3523 }, { "epoch": 0.052620969247194616, "grad_norm": 0.443359375, "grad_norm_var": 0.0016985416412353515, "learning_rate": 0.0001, "loss": 1.5422, "loss/crossentropy": 2.5847750902175903, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.19453522562980652, "step": 3524 }, { "epoch": 0.0526359014178096, "grad_norm": 0.40234375, "grad_norm_var": 0.0016542911529541016, "learning_rate": 0.0001, "loss": 1.4976, "loss/crossentropy": 2.342091917991638, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.18513450771570206, "step": 3525 }, { "epoch": 0.05265083358842458, "grad_norm": 0.494140625, "grad_norm_var": 0.0018198490142822266, "learning_rate": 0.0001, "loss": 1.692, "loss/crossentropy": 2.714049220085144, "loss/fcd": 1.47265625, "loss/idx": 9.0, "loss/logits": 0.21938493102788925, "step": 3526 }, { "epoch": 0.052665765759039564, "grad_norm": 0.396484375, "grad_norm_var": 0.001668548583984375, "learning_rate": 0.0001, "loss": 1.5558, "loss/crossentropy": 2.598415493965149, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.2042800486087799, "step": 3527 }, { "epoch": 0.052680697929654546, "grad_norm": 0.3984375, "grad_norm_var": 0.0016748905181884766, "learning_rate": 0.0001, "loss": 1.5065, "loss/crossentropy": 2.763436436653137, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.19009945541620255, "step": 3528 }, { "epoch": 0.05269563010026953, "grad_norm": 0.43359375, "grad_norm_var": 0.001273202896118164, "learning_rate": 0.0001, "loss": 1.7061, "loss/crossentropy": 2.4988582134246826, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.22568102180957794, "step": 3529 }, { "epoch": 0.052710562270884505, "grad_norm": 0.369140625, "grad_norm_var": 0.0013796329498291016, "learning_rate": 0.0001, "loss": 1.5831, "loss/crossentropy": 2.5627505779266357, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.18861418962478638, "step": 3530 }, { "epoch": 0.05272549444149949, "grad_norm": 0.435546875, "grad_norm_var": 0.001392221450805664, "learning_rate": 0.0001, "loss": 1.5516, "loss/crossentropy": 2.556346297264099, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19223041832447052, "step": 3531 }, { "epoch": 0.05274042661211447, "grad_norm": 0.376953125, "grad_norm_var": 0.0014627456665039062, "learning_rate": 0.0001, "loss": 1.4625, "loss/crossentropy": 2.809193968772888, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18517035990953445, "step": 3532 }, { "epoch": 0.05275535878272945, "grad_norm": 0.361328125, "grad_norm_var": 0.001616668701171875, "learning_rate": 0.0001, "loss": 1.4377, "loss/crossentropy": 2.4351552724838257, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.16037864238023758, "step": 3533 }, { "epoch": 0.052770290953344436, "grad_norm": 0.44140625, "grad_norm_var": 0.0016217390696207682, "learning_rate": 0.0001, "loss": 1.6642, "loss/crossentropy": 2.975360870361328, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.2384132742881775, "step": 3534 }, { "epoch": 0.05278522312395941, "grad_norm": 0.3984375, "grad_norm_var": 0.001486953099568685, "learning_rate": 0.0001, "loss": 1.7331, "loss/crossentropy": 2.581705689430237, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.2447703778743744, "step": 3535 }, { "epoch": 0.052800155294574394, "grad_norm": 0.375, "grad_norm_var": 0.0015884240468343098, "learning_rate": 0.0001, "loss": 1.4583, "loss/crossentropy": 2.697448492050171, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.17317106574773788, "step": 3536 }, { "epoch": 0.05281508746518938, "grad_norm": 0.400390625, "grad_norm_var": 0.0015965779622395834, "learning_rate": 0.0001, "loss": 1.4813, "loss/crossentropy": 2.7004029750823975, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.1961718201637268, "step": 3537 }, { "epoch": 0.05283001963580436, "grad_norm": 0.3828125, "grad_norm_var": 0.001241922378540039, "learning_rate": 0.0001, "loss": 1.5278, "loss/crossentropy": 2.5407464504241943, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.18406888842582703, "step": 3538 }, { "epoch": 0.05284495180641934, "grad_norm": 0.40625, "grad_norm_var": 0.0011983235677083333, "learning_rate": 0.0001, "loss": 1.5403, "loss/crossentropy": 2.603774666786194, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.1965571492910385, "step": 3539 }, { "epoch": 0.052859883977034325, "grad_norm": 0.3671875, "grad_norm_var": 0.001193984349568685, "learning_rate": 0.0001, "loss": 1.4561, "loss/crossentropy": 2.7813162803649902, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.17487117648124695, "step": 3540 }, { "epoch": 0.0528748161476493, "grad_norm": 0.462890625, "grad_norm_var": 0.001422119140625, "learning_rate": 0.0001, "loss": 1.5712, "loss/crossentropy": 2.8230735063552856, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.2078821361064911, "step": 3541 }, { "epoch": 0.05288974831826428, "grad_norm": 0.390625, "grad_norm_var": 0.0008787631988525391, "learning_rate": 0.0001, "loss": 1.4084, "loss/crossentropy": 2.705218553543091, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.17405175417661667, "step": 3542 }, { "epoch": 0.052904680488879266, "grad_norm": 0.37890625, "grad_norm_var": 0.0009057998657226562, "learning_rate": 0.0001, "loss": 1.53, "loss/crossentropy": 2.735429525375366, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.1901097148656845, "step": 3543 }, { "epoch": 0.05291961265949425, "grad_norm": 0.427734375, "grad_norm_var": 0.0009584903717041015, "learning_rate": 0.0001, "loss": 1.5035, "loss/crossentropy": 2.5994791984558105, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.18320050835609436, "step": 3544 }, { "epoch": 0.05293454483010923, "grad_norm": 0.37109375, "grad_norm_var": 0.0009269555409749349, "learning_rate": 0.0001, "loss": 1.3816, "loss/crossentropy": 2.927733063697815, "loss/fcd": 1.22265625, "loss/idx": 9.0, "loss/logits": 0.1588970273733139, "step": 3545 }, { "epoch": 0.05294947700072421, "grad_norm": 0.359375, "grad_norm_var": 0.0009686787923177083, "learning_rate": 0.0001, "loss": 1.5438, "loss/crossentropy": 2.529976010322571, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19225668162107468, "step": 3546 }, { "epoch": 0.05296440917133919, "grad_norm": 0.55078125, "grad_norm_var": 0.00240629514058431, "learning_rate": 0.0001, "loss": 1.6828, "loss/crossentropy": 2.5190505981445312, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.22184914350509644, "step": 3547 }, { "epoch": 0.05297934134195417, "grad_norm": 0.41015625, "grad_norm_var": 0.0023590087890625, "learning_rate": 0.0001, "loss": 1.5574, "loss/crossentropy": 2.4725492000579834, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.20193304121494293, "step": 3548 }, { "epoch": 0.052994273512569155, "grad_norm": 0.373046875, "grad_norm_var": 0.002298927307128906, "learning_rate": 0.0001, "loss": 1.4655, "loss/crossentropy": 2.445379137992859, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18032163381576538, "step": 3549 }, { "epoch": 0.05300920568318414, "grad_norm": 0.361328125, "grad_norm_var": 0.002321736017862956, "learning_rate": 0.0001, "loss": 1.4961, "loss/crossentropy": 2.6729835271835327, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.1835547238588333, "step": 3550 }, { "epoch": 0.05302413785379912, "grad_norm": 0.375, "grad_norm_var": 0.002364079157511393, "learning_rate": 0.0001, "loss": 1.5653, "loss/crossentropy": 2.4613808393478394, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.20984559506177902, "step": 3551 }, { "epoch": 0.053039070024414096, "grad_norm": 0.326171875, "grad_norm_var": 0.0026728312174479165, "learning_rate": 0.0001, "loss": 1.4895, "loss/crossentropy": 2.4635735750198364, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18477663397789001, "step": 3552 }, { "epoch": 0.05305400219502908, "grad_norm": 0.3671875, "grad_norm_var": 0.002724440892537435, "learning_rate": 0.0001, "loss": 1.4486, "loss/crossentropy": 2.610003709793091, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.18298635631799698, "step": 3553 }, { "epoch": 0.05306893436564406, "grad_norm": 0.435546875, "grad_norm_var": 0.0028167088826497396, "learning_rate": 0.0001, "loss": 1.6428, "loss/crossentropy": 2.676245093345642, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.22482185065746307, "step": 3554 }, { "epoch": 0.053083866536259044, "grad_norm": 0.4140625, "grad_norm_var": 0.002829424540201823, "learning_rate": 0.0001, "loss": 1.747, "loss/crossentropy": 2.4803667068481445, "loss/fcd": 1.5078125, "loss/idx": 9.0, "loss/logits": 0.2392278015613556, "step": 3555 }, { "epoch": 0.05309879870687403, "grad_norm": 0.4296875, "grad_norm_var": 0.0028151830037434896, "learning_rate": 0.0001, "loss": 1.6526, "loss/crossentropy": 2.805676579475403, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.20340877771377563, "step": 3556 }, { "epoch": 0.053113730877489, "grad_norm": 0.458984375, "grad_norm_var": 0.002784474690755208, "learning_rate": 0.0001, "loss": 1.5722, "loss/crossentropy": 2.42125928401947, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.2089235484600067, "step": 3557 }, { "epoch": 0.053128663048103986, "grad_norm": 0.423828125, "grad_norm_var": 0.002803659439086914, "learning_rate": 0.0001, "loss": 1.5628, "loss/crossentropy": 2.543775200843811, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.1917477622628212, "step": 3558 }, { "epoch": 0.05314359521871897, "grad_norm": 0.380859375, "grad_norm_var": 0.002797381083170573, "learning_rate": 0.0001, "loss": 1.4273, "loss/crossentropy": 2.7690653800964355, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.1772596687078476, "step": 3559 }, { "epoch": 0.05315852738933395, "grad_norm": 0.359375, "grad_norm_var": 0.0028735955556233725, "learning_rate": 0.0001, "loss": 1.3958, "loss/crossentropy": 2.5764238834381104, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.16143980622291565, "step": 3560 }, { "epoch": 0.053173459559948934, "grad_norm": 0.4140625, "grad_norm_var": 0.0028246402740478515, "learning_rate": 0.0001, "loss": 1.5023, "loss/crossentropy": 2.6259433031082153, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.19373437762260437, "step": 3561 }, { "epoch": 0.053188391730563916, "grad_norm": 0.453125, "grad_norm_var": 0.0028353214263916017, "learning_rate": 0.0001, "loss": 1.5202, "loss/crossentropy": 2.7407346963882446, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18817999213933945, "step": 3562 }, { "epoch": 0.05320332390117889, "grad_norm": 0.37890625, "grad_norm_var": 0.0014170169830322265, "learning_rate": 0.0001, "loss": 1.5102, "loss/crossentropy": 2.5407673120498657, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.19379990547895432, "step": 3563 }, { "epoch": 0.053218256071793875, "grad_norm": 0.400390625, "grad_norm_var": 0.0014066060384114584, "learning_rate": 0.0001, "loss": 1.4498, "loss/crossentropy": 2.63564932346344, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.1724376529455185, "step": 3564 }, { "epoch": 0.05323318824240886, "grad_norm": 0.4375, "grad_norm_var": 0.0014606316884358723, "learning_rate": 0.0001, "loss": 1.5564, "loss/crossentropy": 2.546329140663147, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.20093993097543716, "step": 3565 }, { "epoch": 0.05324812041302384, "grad_norm": 0.36328125, "grad_norm_var": 0.0014505386352539062, "learning_rate": 0.0001, "loss": 1.542, "loss/crossentropy": 2.5702253580093384, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.19825299829244614, "step": 3566 }, { "epoch": 0.05326305258363882, "grad_norm": 0.388671875, "grad_norm_var": 0.0014146010080973307, "learning_rate": 0.0001, "loss": 1.6444, "loss/crossentropy": 2.5880179405212402, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.2225026786327362, "step": 3567 }, { "epoch": 0.0532779847542538, "grad_norm": 0.39453125, "grad_norm_var": 0.0010157267252604166, "learning_rate": 0.0001, "loss": 1.8081, "loss/crossentropy": 2.5737674236297607, "loss/fcd": 1.546875, "loss/idx": 9.0, "loss/logits": 0.2612714543938637, "step": 3568 }, { "epoch": 0.05329291692486878, "grad_norm": 0.44921875, "grad_norm_var": 0.0010090510050455729, "learning_rate": 0.0001, "loss": 1.5216, "loss/crossentropy": 2.675176739692688, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.1895691454410553, "step": 3569 }, { "epoch": 0.053307849095483764, "grad_norm": 0.416015625, "grad_norm_var": 0.0009699503580729167, "learning_rate": 0.0001, "loss": 1.693, "loss/crossentropy": 2.5987719297409058, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.2398580014705658, "step": 3570 }, { "epoch": 0.05332278126609875, "grad_norm": 0.423828125, "grad_norm_var": 0.0009809970855712891, "learning_rate": 0.0001, "loss": 1.4856, "loss/crossentropy": 2.5883471965789795, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18094515800476074, "step": 3571 }, { "epoch": 0.05333771343671373, "grad_norm": 0.361328125, "grad_norm_var": 0.0011006037394205729, "learning_rate": 0.0001, "loss": 1.4023, "loss/crossentropy": 2.615593671798706, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.16788973659276962, "step": 3572 }, { "epoch": 0.05335264560732871, "grad_norm": 0.373046875, "grad_norm_var": 0.0009607315063476562, "learning_rate": 0.0001, "loss": 1.4457, "loss/crossentropy": 2.5805312395095825, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.1722501516342163, "step": 3573 }, { "epoch": 0.05336757777794369, "grad_norm": 0.361328125, "grad_norm_var": 0.0010156631469726562, "learning_rate": 0.0001, "loss": 1.4057, "loss/crossentropy": 2.6539281606674194, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.15958449989557266, "step": 3574 }, { "epoch": 0.05338250994855867, "grad_norm": 0.4140625, "grad_norm_var": 0.0010121504465738932, "learning_rate": 0.0001, "loss": 1.5821, "loss/crossentropy": 2.386804938316345, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.20323237031698227, "step": 3575 }, { "epoch": 0.05339744211917365, "grad_norm": 0.337890625, "grad_norm_var": 0.0011553446451822917, "learning_rate": 0.0001, "loss": 1.3965, "loss/crossentropy": 2.6973042488098145, "loss/fcd": 1.2265625, "loss/idx": 9.0, "loss/logits": 0.16991064697504044, "step": 3576 }, { "epoch": 0.053412374289788636, "grad_norm": 0.375, "grad_norm_var": 0.0011667887369791667, "learning_rate": 0.0001, "loss": 1.5718, "loss/crossentropy": 2.6565704345703125, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.21238093078136444, "step": 3577 }, { "epoch": 0.05342730646040362, "grad_norm": 0.357421875, "grad_norm_var": 0.0010040124257405599, "learning_rate": 0.0001, "loss": 1.4543, "loss/crossentropy": 2.475495457649231, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.17696359753608704, "step": 3578 }, { "epoch": 0.053442238631018595, "grad_norm": 0.390625, "grad_norm_var": 0.0009960015614827475, "learning_rate": 0.0001, "loss": 1.4347, "loss/crossentropy": 2.5263818502426147, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.17301492393016815, "step": 3579 }, { "epoch": 0.05345717080163358, "grad_norm": 0.384765625, "grad_norm_var": 0.000990152359008789, "learning_rate": 0.0001, "loss": 1.5828, "loss/crossentropy": 2.4607930183410645, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.20389610528945923, "step": 3580 }, { "epoch": 0.05347210297224856, "grad_norm": 0.443359375, "grad_norm_var": 0.00102996826171875, "learning_rate": 0.0001, "loss": 1.7667, "loss/crossentropy": 2.75494384765625, "loss/fcd": 1.52734375, "loss/idx": 9.0, "loss/logits": 0.23936307430267334, "step": 3581 }, { "epoch": 0.05348703514286354, "grad_norm": 0.341796875, "grad_norm_var": 0.001134347915649414, "learning_rate": 0.0001, "loss": 1.4634, "loss/crossentropy": 2.4165592193603516, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.1821940466761589, "step": 3582 }, { "epoch": 0.053501967313478525, "grad_norm": 0.373046875, "grad_norm_var": 0.0011488437652587891, "learning_rate": 0.0001, "loss": 1.6239, "loss/crossentropy": 2.6448994874954224, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.20590806752443314, "step": 3583 }, { "epoch": 0.05351689948409351, "grad_norm": 0.3671875, "grad_norm_var": 0.0011693159739176431, "learning_rate": 0.0001, "loss": 1.4763, "loss/crossentropy": 2.5700855255126953, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.17940231412649155, "step": 3584 }, { "epoch": 0.053531831654708484, "grad_norm": 0.37109375, "grad_norm_var": 0.0008882999420166016, "learning_rate": 0.0001, "loss": 1.5083, "loss/crossentropy": 2.7515445947647095, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.19188088178634644, "step": 3585 }, { "epoch": 0.053546763825323467, "grad_norm": 0.396484375, "grad_norm_var": 0.0008202711741129557, "learning_rate": 0.0001, "loss": 1.5071, "loss/crossentropy": 2.561580538749695, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.18677616864442825, "step": 3586 }, { "epoch": 0.05356169599593845, "grad_norm": 0.400390625, "grad_norm_var": 0.0007161299387613932, "learning_rate": 0.0001, "loss": 1.5094, "loss/crossentropy": 2.5624606609344482, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.1734333336353302, "step": 3587 }, { "epoch": 0.05357662816655343, "grad_norm": 0.412109375, "grad_norm_var": 0.0007640679677327473, "learning_rate": 0.0001, "loss": 1.6617, "loss/crossentropy": 2.546544313430786, "loss/fcd": 1.45703125, "loss/idx": 9.0, "loss/logits": 0.2046288102865219, "step": 3588 }, { "epoch": 0.053591560337168415, "grad_norm": 0.37109375, "grad_norm_var": 0.0007664362589518229, "learning_rate": 0.0001, "loss": 1.4251, "loss/crossentropy": 2.6395353078842163, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.17122472077608109, "step": 3589 }, { "epoch": 0.0536064925077834, "grad_norm": 0.421875, "grad_norm_var": 0.0008359114329020182, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.5208293199539185, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.19267799705266953, "step": 3590 }, { "epoch": 0.05362142467839837, "grad_norm": 0.3828125, "grad_norm_var": 0.0007753849029541015, "learning_rate": 0.0001, "loss": 1.607, "loss/crossentropy": 2.610782742500305, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.21639873832464218, "step": 3591 }, { "epoch": 0.053636356849013356, "grad_norm": 0.33984375, "grad_norm_var": 0.0007638931274414062, "learning_rate": 0.0001, "loss": 1.5636, "loss/crossentropy": 2.431633949279785, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.19638380408287048, "step": 3592 }, { "epoch": 0.05365128901962834, "grad_norm": 0.349609375, "grad_norm_var": 0.0008314609527587891, "learning_rate": 0.0001, "loss": 1.507, "loss/crossentropy": 2.746755003929138, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.190554179251194, "step": 3593 }, { "epoch": 0.05366622119024332, "grad_norm": 0.35546875, "grad_norm_var": 0.0008379618326822916, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.5412946939468384, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.17801547050476074, "step": 3594 }, { "epoch": 0.053681153360858304, "grad_norm": 0.62890625, "grad_norm_var": 0.00468133290608724, "learning_rate": 0.0001, "loss": 2.0374, "loss/crossentropy": 2.3553117513656616, "loss/fcd": 1.7421875, "loss/idx": 9.0, "loss/logits": 0.2951717674732208, "step": 3595 }, { "epoch": 0.05369608553147328, "grad_norm": 0.375, "grad_norm_var": 0.004702234268188476, "learning_rate": 0.0001, "loss": 1.4817, "loss/crossentropy": 2.5891562700271606, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.19261066615581512, "step": 3596 }, { "epoch": 0.05371101770208826, "grad_norm": 0.482421875, "grad_norm_var": 0.005046192804972331, "learning_rate": 0.0001, "loss": 1.6686, "loss/crossentropy": 2.652516722679138, "loss/fcd": 1.45703125, "loss/idx": 9.0, "loss/logits": 0.211555115878582, "step": 3597 }, { "epoch": 0.053725949872703245, "grad_norm": 0.357421875, "grad_norm_var": 0.004944213231404622, "learning_rate": 0.0001, "loss": 1.5106, "loss/crossentropy": 2.7268803119659424, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.19417937099933624, "step": 3598 }, { "epoch": 0.05374088204331823, "grad_norm": 0.3828125, "grad_norm_var": 0.0049163182576497395, "learning_rate": 0.0001, "loss": 1.3973, "loss/crossentropy": 2.5268092155456543, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.16292723268270493, "step": 3599 }, { "epoch": 0.05375581421393321, "grad_norm": 0.4375, "grad_norm_var": 0.00492089589436849, "learning_rate": 0.0001, "loss": 1.5661, "loss/crossentropy": 2.8985859155654907, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.19499944150447845, "step": 3600 }, { "epoch": 0.05377074638454819, "grad_norm": 0.412109375, "grad_norm_var": 0.0048457940419514975, "learning_rate": 0.0001, "loss": 1.6099, "loss/crossentropy": 2.564196825027466, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.23491153120994568, "step": 3601 }, { "epoch": 0.05378567855516317, "grad_norm": 0.5234375, "grad_norm_var": 0.005681610107421875, "learning_rate": 0.0001, "loss": 1.7184, "loss/crossentropy": 2.430151104927063, "loss/fcd": 1.47265625, "loss/idx": 9.0, "loss/logits": 0.24572831392288208, "step": 3602 }, { "epoch": 0.05380061072577815, "grad_norm": 0.39453125, "grad_norm_var": 0.005694818496704101, "learning_rate": 0.0001, "loss": 1.4878, "loss/crossentropy": 2.2480610013008118, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.17137347161769867, "step": 3603 }, { "epoch": 0.053815542896393134, "grad_norm": 1.0703125, "grad_norm_var": 0.032589658101399736, "learning_rate": 0.0001, "loss": 1.9416, "loss/crossentropy": 2.6608755588531494, "loss/fcd": 1.60546875, "loss/idx": 9.0, "loss/logits": 0.33614783734083176, "step": 3604 }, { "epoch": 0.05383047506700812, "grad_norm": 0.37890625, "grad_norm_var": 0.032505734761555986, "learning_rate": 0.0001, "loss": 1.4425, "loss/crossentropy": 2.695743203163147, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17685801535844803, "step": 3605 }, { "epoch": 0.0538454072376231, "grad_norm": 0.455078125, "grad_norm_var": 0.032424402236938474, "learning_rate": 0.0001, "loss": 1.5912, "loss/crossentropy": 2.909826636314392, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.20842208713293076, "step": 3606 }, { "epoch": 0.053860339408238075, "grad_norm": 0.384765625, "grad_norm_var": 0.03240509033203125, "learning_rate": 0.0001, "loss": 1.4916, "loss/crossentropy": 2.832564353942871, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.1986265629529953, "step": 3607 }, { "epoch": 0.05387527157885306, "grad_norm": 0.41796875, "grad_norm_var": 0.03155568440755208, "learning_rate": 0.0001, "loss": 1.5366, "loss/crossentropy": 2.7854658365249634, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.19680358469486237, "step": 3608 }, { "epoch": 0.05389020374946804, "grad_norm": 0.37890625, "grad_norm_var": 0.031166823705037434, "learning_rate": 0.0001, "loss": 1.4382, "loss/crossentropy": 2.718261957168579, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.1803724318742752, "step": 3609 }, { "epoch": 0.053905135920083024, "grad_norm": 0.35546875, "grad_norm_var": 0.031166823705037434, "learning_rate": 0.0001, "loss": 1.5191, "loss/crossentropy": 2.6998735666275024, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.1948431059718132, "step": 3610 }, { "epoch": 0.053920068090698006, "grad_norm": 0.359375, "grad_norm_var": 0.029806884129842122, "learning_rate": 0.0001, "loss": 1.467, "loss/crossentropy": 2.694177269935608, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18184836953878403, "step": 3611 }, { "epoch": 0.05393500026131299, "grad_norm": 0.482421875, "grad_norm_var": 0.029484303792317708, "learning_rate": 0.0001, "loss": 1.9546, "loss/crossentropy": 2.3788228034973145, "loss/fcd": 1.671875, "loss/idx": 9.0, "loss/logits": 0.28275762498378754, "step": 3612 }, { "epoch": 0.053949932431927965, "grad_norm": 0.330078125, "grad_norm_var": 0.03036950429280599, "learning_rate": 0.0001, "loss": 1.3984, "loss/crossentropy": 2.499714732170105, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.16398991644382477, "step": 3613 }, { "epoch": 0.05396486460254295, "grad_norm": 0.3984375, "grad_norm_var": 0.029995330174764, "learning_rate": 0.0001, "loss": 1.5451, "loss/crossentropy": 2.748724937438965, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.21305356174707413, "step": 3614 }, { "epoch": 0.05397979677315793, "grad_norm": 0.365234375, "grad_norm_var": 0.03016656239827474, "learning_rate": 0.0001, "loss": 1.5307, "loss/crossentropy": 2.5347899198532104, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.2142723649740219, "step": 3615 }, { "epoch": 0.05399472894377291, "grad_norm": 0.416015625, "grad_norm_var": 0.030221287409464517, "learning_rate": 0.0001, "loss": 1.6011, "loss/crossentropy": 2.4539976119995117, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.20659806579351425, "step": 3616 }, { "epoch": 0.054009661114387895, "grad_norm": 0.5078125, "grad_norm_var": 0.030371602376302084, "learning_rate": 0.0001, "loss": 1.606, "loss/crossentropy": 2.6970176696777344, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.21924511343240738, "step": 3617 }, { "epoch": 0.05402459328500287, "grad_norm": 0.390625, "grad_norm_var": 0.03019434611002604, "learning_rate": 0.0001, "loss": 1.6307, "loss/crossentropy": 2.5961776971817017, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.20492420345544815, "step": 3618 }, { "epoch": 0.054039525455617854, "grad_norm": 0.431640625, "grad_norm_var": 0.030041233698527018, "learning_rate": 0.0001, "loss": 1.5826, "loss/crossentropy": 2.9719862937927246, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.22320950031280518, "step": 3619 }, { "epoch": 0.05405445762623284, "grad_norm": 1.046875, "grad_norm_var": 0.02812205950419108, "learning_rate": 0.0001, "loss": 1.9887, "loss/crossentropy": 3.1768345832824707, "loss/fcd": 1.71484375, "loss/idx": 9.0, "loss/logits": 0.27387573570013046, "step": 3620 }, { "epoch": 0.05406938979684782, "grad_norm": 0.384765625, "grad_norm_var": 0.028073565165201823, "learning_rate": 0.0001, "loss": 1.5698, "loss/crossentropy": 2.7388516664505005, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.21038156747817993, "step": 3621 }, { "epoch": 0.0540843219674628, "grad_norm": 0.4453125, "grad_norm_var": 0.028065220514933268, "learning_rate": 0.0001, "loss": 1.5843, "loss/crossentropy": 2.5638375282287598, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.205360546708107, "step": 3622 }, { "epoch": 0.054099254138077785, "grad_norm": 0.43359375, "grad_norm_var": 0.02783196767171224, "learning_rate": 0.0001, "loss": 1.6604, "loss/crossentropy": 2.5244650840759277, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.2307024598121643, "step": 3623 }, { "epoch": 0.05411418630869276, "grad_norm": 0.359375, "grad_norm_var": 0.02826970418294271, "learning_rate": 0.0001, "loss": 1.4223, "loss/crossentropy": 2.5388057231903076, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.16450031101703644, "step": 3624 }, { "epoch": 0.05412911847930774, "grad_norm": 0.466796875, "grad_norm_var": 0.02800291379292806, "learning_rate": 0.0001, "loss": 1.6957, "loss/crossentropy": 2.8608860969543457, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.24260436743497849, "step": 3625 }, { "epoch": 0.054144050649922726, "grad_norm": 0.396484375, "grad_norm_var": 0.02760003407796224, "learning_rate": 0.0001, "loss": 1.6105, "loss/crossentropy": 2.44475519657135, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.2003495693206787, "step": 3626 }, { "epoch": 0.05415898282053771, "grad_norm": 0.3515625, "grad_norm_var": 0.02769921620686849, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.67139732837677, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.176588773727417, "step": 3627 }, { "epoch": 0.05417391499115269, "grad_norm": 0.404296875, "grad_norm_var": 0.027747535705566408, "learning_rate": 0.0001, "loss": 1.4931, "loss/crossentropy": 2.7549006938934326, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.19235248863697052, "step": 3628 }, { "epoch": 0.05418884716176767, "grad_norm": 0.34375, "grad_norm_var": 0.027548710505167644, "learning_rate": 0.0001, "loss": 1.5073, "loss/crossentropy": 2.4534703493118286, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.18306588381528854, "step": 3629 }, { "epoch": 0.05420377933238265, "grad_norm": 0.359375, "grad_norm_var": 0.027893940607706707, "learning_rate": 0.0001, "loss": 1.5446, "loss/crossentropy": 2.751827359199524, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.19696955382823944, "step": 3630 }, { "epoch": 0.05421871150299763, "grad_norm": 0.359375, "grad_norm_var": 0.027957598368326824, "learning_rate": 0.0001, "loss": 1.4436, "loss/crossentropy": 2.5347853899002075, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.1701226383447647, "step": 3631 }, { "epoch": 0.054233643673612615, "grad_norm": 0.3828125, "grad_norm_var": 0.028148635228474935, "learning_rate": 0.0001, "loss": 1.7756, "loss/crossentropy": 2.402878761291504, "loss/fcd": 1.52734375, "loss/idx": 9.0, "loss/logits": 0.2482563555240631, "step": 3632 }, { "epoch": 0.0542485758442276, "grad_norm": 0.396484375, "grad_norm_var": 0.027939351399739583, "learning_rate": 0.0001, "loss": 1.6704, "loss/crossentropy": 2.1788190603256226, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.20946920663118362, "step": 3633 }, { "epoch": 0.05426350801484258, "grad_norm": 0.43359375, "grad_norm_var": 0.02780297597249349, "learning_rate": 0.0001, "loss": 1.6294, "loss/crossentropy": 2.4267778396606445, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.19969376176595688, "step": 3634 }, { "epoch": 0.054278440185457556, "grad_norm": 0.353515625, "grad_norm_var": 0.02824293772379557, "learning_rate": 0.0001, "loss": 1.424, "loss/crossentropy": 2.8339085578918457, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.18180087953805923, "step": 3635 }, { "epoch": 0.05429337235607254, "grad_norm": 0.443359375, "grad_norm_var": 0.0015591780344645183, "learning_rate": 0.0001, "loss": 1.603, "loss/crossentropy": 2.685309886932373, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.21241353452205658, "step": 3636 }, { "epoch": 0.05430830452668752, "grad_norm": 0.375, "grad_norm_var": 0.0015780131022135417, "learning_rate": 0.0001, "loss": 1.5816, "loss/crossentropy": 2.6758824586868286, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.2066161260008812, "step": 3637 }, { "epoch": 0.054323236697302504, "grad_norm": 0.44140625, "grad_norm_var": 0.001552263895670573, "learning_rate": 0.0001, "loss": 1.5513, "loss/crossentropy": 2.3653281927108765, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.18803544342517853, "step": 3638 }, { "epoch": 0.05433816886791749, "grad_norm": 0.40234375, "grad_norm_var": 0.0014474868774414062, "learning_rate": 0.0001, "loss": 1.6437, "loss/crossentropy": 2.6684165000915527, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.21794097125530243, "step": 3639 }, { "epoch": 0.05435310103853246, "grad_norm": 0.353515625, "grad_norm_var": 0.0014750003814697266, "learning_rate": 0.0001, "loss": 1.5475, "loss/crossentropy": 2.6810033321380615, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.19980794936418533, "step": 3640 }, { "epoch": 0.054368033209147446, "grad_norm": 0.41796875, "grad_norm_var": 0.0011336644490559896, "learning_rate": 0.0001, "loss": 1.6152, "loss/crossentropy": 2.845176935195923, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.2167728692293167, "step": 3641 }, { "epoch": 0.05438296537976243, "grad_norm": 0.451171875, "grad_norm_var": 0.0013793309529622395, "learning_rate": 0.0001, "loss": 1.7066, "loss/crossentropy": 2.5580689907073975, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.24170825630426407, "step": 3642 }, { "epoch": 0.05439789755037741, "grad_norm": 0.37109375, "grad_norm_var": 0.0012982686360677083, "learning_rate": 0.0001, "loss": 1.5211, "loss/crossentropy": 2.577123761177063, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18910322338342667, "step": 3643 }, { "epoch": 0.054412829720992394, "grad_norm": 0.412109375, "grad_norm_var": 0.00131378173828125, "learning_rate": 0.0001, "loss": 1.6247, "loss/crossentropy": 2.4002922773361206, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.20279820263385773, "step": 3644 }, { "epoch": 0.054427761891607376, "grad_norm": 0.4140625, "grad_norm_var": 0.001155853271484375, "learning_rate": 0.0001, "loss": 1.4783, "loss/crossentropy": 2.512631893157959, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18928354233503342, "step": 3645 }, { "epoch": 0.05444269406222235, "grad_norm": 0.439453125, "grad_norm_var": 0.0011447747548421225, "learning_rate": 0.0001, "loss": 1.7278, "loss/crossentropy": 2.5473222732543945, "loss/fcd": 1.50390625, "loss/idx": 9.0, "loss/logits": 0.22387190163135529, "step": 3646 }, { "epoch": 0.054457626232837335, "grad_norm": 0.38671875, "grad_norm_var": 0.0010326226552327474, "learning_rate": 0.0001, "loss": 1.6228, "loss/crossentropy": 2.766842484474182, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.22434547543525696, "step": 3647 }, { "epoch": 0.05447255840345232, "grad_norm": 0.390625, "grad_norm_var": 0.0010136763254801433, "learning_rate": 0.0001, "loss": 1.6114, "loss/crossentropy": 2.7063294649124146, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.22077026963233948, "step": 3648 }, { "epoch": 0.0544874905740673, "grad_norm": 0.412109375, "grad_norm_var": 0.001010878880818685, "learning_rate": 0.0001, "loss": 1.5538, "loss/crossentropy": 2.6318084001541138, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.20612207055091858, "step": 3649 }, { "epoch": 0.05450242274468228, "grad_norm": 0.34765625, "grad_norm_var": 0.0011577447255452474, "learning_rate": 0.0001, "loss": 1.5288, "loss/crossentropy": 2.5682010650634766, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19679761677980423, "step": 3650 }, { "epoch": 0.054517354915297266, "grad_norm": 0.396484375, "grad_norm_var": 0.00100248654683431, "learning_rate": 0.0001, "loss": 1.7254, "loss/crossentropy": 2.7556906938552856, "loss/fcd": 1.46875, "loss/idx": 9.0, "loss/logits": 0.25669705122709274, "step": 3651 }, { "epoch": 0.05453228708591224, "grad_norm": 0.359375, "grad_norm_var": 0.0009963353474934896, "learning_rate": 0.0001, "loss": 1.5095, "loss/crossentropy": 2.540266990661621, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.1892094984650612, "step": 3652 }, { "epoch": 0.054547219256527224, "grad_norm": 0.455078125, "grad_norm_var": 0.0011494795481363932, "learning_rate": 0.0001, "loss": 1.6484, "loss/crossentropy": 2.4181065559387207, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.19916698336601257, "step": 3653 }, { "epoch": 0.05456215142714221, "grad_norm": 0.369140625, "grad_norm_var": 0.0011077245076497396, "learning_rate": 0.0001, "loss": 1.4255, "loss/crossentropy": 2.6972795724868774, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.17554078996181488, "step": 3654 }, { "epoch": 0.05457708359775719, "grad_norm": 0.52734375, "grad_norm_var": 0.0021453221638997396, "learning_rate": 0.0001, "loss": 1.5732, "loss/crossentropy": 2.737204670906067, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.2060469686985016, "step": 3655 }, { "epoch": 0.05459201576837217, "grad_norm": 0.380859375, "grad_norm_var": 0.0019989013671875, "learning_rate": 0.0001, "loss": 1.5569, "loss/crossentropy": 2.477833390235901, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.20531754195690155, "step": 3656 }, { "epoch": 0.05460694793898715, "grad_norm": 0.462890625, "grad_norm_var": 0.00218351682027181, "learning_rate": 0.0001, "loss": 1.4419, "loss/crossentropy": 2.425691604614258, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.18410807847976685, "step": 3657 }, { "epoch": 0.05462188010960213, "grad_norm": 0.390625, "grad_norm_var": 0.0020884195963541668, "learning_rate": 0.0001, "loss": 1.5251, "loss/crossentropy": 2.5857698917388916, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19695573300123215, "step": 3658 }, { "epoch": 0.05463681228021711, "grad_norm": 0.37109375, "grad_norm_var": 0.0020884195963541668, "learning_rate": 0.0001, "loss": 1.5188, "loss/crossentropy": 2.5920339822769165, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.18282581865787506, "step": 3659 }, { "epoch": 0.054651744450832096, "grad_norm": 0.40625, "grad_norm_var": 0.002086750666300456, "learning_rate": 0.0001, "loss": 1.4677, "loss/crossentropy": 2.8262100219726562, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.18647944927215576, "step": 3660 }, { "epoch": 0.05466667662144708, "grad_norm": 0.546875, "grad_norm_var": 0.0033167362213134765, "learning_rate": 0.0001, "loss": 1.7588, "loss/crossentropy": 2.586805582046509, "loss/fcd": 1.515625, "loss/idx": 9.0, "loss/logits": 0.24313172698020935, "step": 3661 }, { "epoch": 0.05468160879206206, "grad_norm": 0.431640625, "grad_norm_var": 0.0032952467600504557, "learning_rate": 0.0001, "loss": 1.6556, "loss/crossentropy": 2.673077344894409, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.2180698737502098, "step": 3662 }, { "epoch": 0.05469654096267704, "grad_norm": 0.392578125, "grad_norm_var": 0.0032755533854166665, "learning_rate": 0.0001, "loss": 1.4816, "loss/crossentropy": 2.895764946937561, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.17695405334234238, "step": 3663 }, { "epoch": 0.05471147313329202, "grad_norm": 0.435546875, "grad_norm_var": 0.003255446751912435, "learning_rate": 0.0001, "loss": 1.5298, "loss/crossentropy": 2.743073582649231, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.1860881820321083, "step": 3664 }, { "epoch": 0.054726405303907, "grad_norm": 0.3671875, "grad_norm_var": 0.0034159342447916668, "learning_rate": 0.0001, "loss": 1.5052, "loss/crossentropy": 2.371363043785095, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19269248098134995, "step": 3665 }, { "epoch": 0.054741337474521985, "grad_norm": 0.39453125, "grad_norm_var": 0.003132120768229167, "learning_rate": 0.0001, "loss": 1.4969, "loss/crossentropy": 2.696254014968872, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.1960739940404892, "step": 3666 }, { "epoch": 0.05475626964513697, "grad_norm": 0.49609375, "grad_norm_var": 0.0034669081370035807, "learning_rate": 0.0001, "loss": 1.7719, "loss/crossentropy": 2.405644655227661, "loss/fcd": 1.53125, "loss/idx": 9.0, "loss/logits": 0.24067574739456177, "step": 3667 }, { "epoch": 0.054771201815751944, "grad_norm": 0.376953125, "grad_norm_var": 0.003334299723307292, "learning_rate": 0.0001, "loss": 1.5618, "loss/crossentropy": 2.4506553411483765, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.19461983442306519, "step": 3668 }, { "epoch": 0.054786133986366926, "grad_norm": 0.4453125, "grad_norm_var": 0.0033014774322509765, "learning_rate": 0.0001, "loss": 1.7559, "loss/crossentropy": 2.376011610031128, "loss/fcd": 1.515625, "loss/idx": 9.0, "loss/logits": 0.24029622226953506, "step": 3669 }, { "epoch": 0.05480106615698191, "grad_norm": 0.37890625, "grad_norm_var": 0.003235117594401042, "learning_rate": 0.0001, "loss": 1.5661, "loss/crossentropy": 2.3686054944992065, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.1950320228934288, "step": 3670 }, { "epoch": 0.05481599832759689, "grad_norm": 0.458984375, "grad_norm_var": 0.00259703000386556, "learning_rate": 0.0001, "loss": 1.5822, "loss/crossentropy": 2.7213776111602783, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.2071990817785263, "step": 3671 }, { "epoch": 0.054830930498211874, "grad_norm": 0.35546875, "grad_norm_var": 0.002773284912109375, "learning_rate": 0.0001, "loss": 1.4664, "loss/crossentropy": 2.6397653818130493, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.17347301542758942, "step": 3672 }, { "epoch": 0.05484586266882686, "grad_norm": 0.392578125, "grad_norm_var": 0.00267486572265625, "learning_rate": 0.0001, "loss": 1.609, "loss/crossentropy": 2.32483446598053, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.198820598423481, "step": 3673 }, { "epoch": 0.05486079483944183, "grad_norm": 0.392578125, "grad_norm_var": 0.0026687463124593098, "learning_rate": 0.0001, "loss": 1.5518, "loss/crossentropy": 2.57778537273407, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.1885473132133484, "step": 3674 }, { "epoch": 0.054875727010056816, "grad_norm": 0.349609375, "grad_norm_var": 0.0028238296508789062, "learning_rate": 0.0001, "loss": 1.4382, "loss/crossentropy": 2.7271443605422974, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17257793247699738, "step": 3675 }, { "epoch": 0.0548906591806718, "grad_norm": 0.404296875, "grad_norm_var": 0.0028260389963785807, "learning_rate": 0.0001, "loss": 1.7589, "loss/crossentropy": 2.6042041778564453, "loss/fcd": 1.5234375, "loss/idx": 9.0, "loss/logits": 0.2354445829987526, "step": 3676 }, { "epoch": 0.05490559135128678, "grad_norm": 0.3515625, "grad_norm_var": 0.0017420291900634766, "learning_rate": 0.0001, "loss": 1.6014, "loss/crossentropy": 2.484697699546814, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.21855255961418152, "step": 3677 }, { "epoch": 0.054920523521901764, "grad_norm": 0.46875, "grad_norm_var": 0.0019772847493489585, "learning_rate": 0.0001, "loss": 1.5533, "loss/crossentropy": 2.678922653198242, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19394905120134354, "step": 3678 }, { "epoch": 0.05493545569251674, "grad_norm": 0.369140625, "grad_norm_var": 0.0020467122395833332, "learning_rate": 0.0001, "loss": 1.527, "loss/crossentropy": 2.4439945220947266, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19500792026519775, "step": 3679 }, { "epoch": 0.05495038786313172, "grad_norm": 0.41015625, "grad_norm_var": 0.001974598566691081, "learning_rate": 0.0001, "loss": 1.5529, "loss/crossentropy": 2.5675488710403442, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.1935175210237503, "step": 3680 }, { "epoch": 0.054965320033746705, "grad_norm": 0.38671875, "grad_norm_var": 0.001911020278930664, "learning_rate": 0.0001, "loss": 1.4769, "loss/crossentropy": 2.53265917301178, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.17999503016471863, "step": 3681 }, { "epoch": 0.05498025220436169, "grad_norm": 0.3359375, "grad_norm_var": 0.0021837711334228515, "learning_rate": 0.0001, "loss": 1.4391, "loss/crossentropy": 2.749938130378723, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.18522265553474426, "step": 3682 }, { "epoch": 0.05499518437497667, "grad_norm": 0.400390625, "grad_norm_var": 0.0015085220336914062, "learning_rate": 0.0001, "loss": 1.5014, "loss/crossentropy": 2.6875782012939453, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.17330869287252426, "step": 3683 }, { "epoch": 0.05501011654559165, "grad_norm": 0.54296875, "grad_norm_var": 0.0028906345367431642, "learning_rate": 0.0001, "loss": 1.615, "loss/crossentropy": 2.4406983852386475, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.21265918761491776, "step": 3684 }, { "epoch": 0.05502504871620663, "grad_norm": 0.376953125, "grad_norm_var": 0.0027943929036458332, "learning_rate": 0.0001, "loss": 1.5131, "loss/crossentropy": 2.630564570426941, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18102607131004333, "step": 3685 }, { "epoch": 0.05503998088682161, "grad_norm": 0.53515625, "grad_norm_var": 0.003913370768229166, "learning_rate": 0.0001, "loss": 1.6293, "loss/crossentropy": 2.6193530559539795, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.2035113349556923, "step": 3686 }, { "epoch": 0.055054913057436594, "grad_norm": 0.392578125, "grad_norm_var": 0.0037393569946289062, "learning_rate": 0.0001, "loss": 1.5702, "loss/crossentropy": 2.6890358924865723, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.214769184589386, "step": 3687 }, { "epoch": 0.05506984522805158, "grad_norm": 0.361328125, "grad_norm_var": 0.0037035465240478514, "learning_rate": 0.0001, "loss": 1.5392, "loss/crossentropy": 2.5303205251693726, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.18767345696687698, "step": 3688 }, { "epoch": 0.05508477739866656, "grad_norm": 0.396484375, "grad_norm_var": 0.003698333104451497, "learning_rate": 0.0001, "loss": 1.4693, "loss/crossentropy": 2.577683925628662, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18025363981723785, "step": 3689 }, { "epoch": 0.055099709569281535, "grad_norm": 0.361328125, "grad_norm_var": 0.0038097222646077474, "learning_rate": 0.0001, "loss": 1.5902, "loss/crossentropy": 2.4243820905685425, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.2112843468785286, "step": 3690 }, { "epoch": 0.05511464173989652, "grad_norm": 0.5390625, "grad_norm_var": 0.004711659749348959, "learning_rate": 0.0001, "loss": 1.7726, "loss/crossentropy": 2.35725736618042, "loss/fcd": 1.53515625, "loss/idx": 9.0, "loss/logits": 0.2374318689107895, "step": 3691 }, { "epoch": 0.0551295739105115, "grad_norm": 0.376953125, "grad_norm_var": 0.00479577382405599, "learning_rate": 0.0001, "loss": 1.5054, "loss/crossentropy": 2.521610379219055, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.1811901330947876, "step": 3692 }, { "epoch": 0.05514450608112648, "grad_norm": 0.390625, "grad_norm_var": 0.004571978251139323, "learning_rate": 0.0001, "loss": 1.5062, "loss/crossentropy": 2.5830864906311035, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1898323819041252, "step": 3693 }, { "epoch": 0.055159438251741466, "grad_norm": 0.41796875, "grad_norm_var": 0.004371134440104166, "learning_rate": 0.0001, "loss": 1.4529, "loss/crossentropy": 2.7669222354888916, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.17556078732013702, "step": 3694 }, { "epoch": 0.05517437042235645, "grad_norm": 0.330078125, "grad_norm_var": 0.004690297444661458, "learning_rate": 0.0001, "loss": 1.4875, "loss/crossentropy": 2.499011993408203, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.17886338382959366, "step": 3695 }, { "epoch": 0.055189302592971425, "grad_norm": 0.359375, "grad_norm_var": 0.004848162333170573, "learning_rate": 0.0001, "loss": 1.4154, "loss/crossentropy": 2.4841572046279907, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.14980823546648026, "step": 3696 }, { "epoch": 0.05520423476358641, "grad_norm": 0.373046875, "grad_norm_var": 0.004895893732706705, "learning_rate": 0.0001, "loss": 1.5005, "loss/crossentropy": 2.464187264442444, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.17631559818983078, "step": 3697 }, { "epoch": 0.05521916693420139, "grad_norm": 0.404296875, "grad_norm_var": 0.004552650451660156, "learning_rate": 0.0001, "loss": 1.5722, "loss/crossentropy": 2.587930202484131, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.19331347942352295, "step": 3698 }, { "epoch": 0.05523409910481637, "grad_norm": 0.3515625, "grad_norm_var": 0.004763650894165039, "learning_rate": 0.0001, "loss": 1.5572, "loss/crossentropy": 2.6849676370620728, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.1938750520348549, "step": 3699 }, { "epoch": 0.055249031275431355, "grad_norm": 0.3515625, "grad_norm_var": 0.0035798231760660808, "learning_rate": 0.0001, "loss": 1.5894, "loss/crossentropy": 2.4983690977096558, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.2221810668706894, "step": 3700 }, { "epoch": 0.05526396344604633, "grad_norm": 0.53125, "grad_norm_var": 0.004698626200358073, "learning_rate": 0.0001, "loss": 1.5908, "loss/crossentropy": 2.473012685775757, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.19233858585357666, "step": 3701 }, { "epoch": 0.055278895616661314, "grad_norm": 0.40625, "grad_norm_var": 0.0034922281901041668, "learning_rate": 0.0001, "loss": 1.5219, "loss/crossentropy": 2.6106337308883667, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18990424275398254, "step": 3702 }, { "epoch": 0.055293827787276296, "grad_norm": 0.4921875, "grad_norm_var": 0.00406047503153483, "learning_rate": 0.0001, "loss": 1.7548, "loss/crossentropy": 2.2518028020858765, "loss/fcd": 1.54296875, "loss/idx": 9.0, "loss/logits": 0.21185297518968582, "step": 3703 }, { "epoch": 0.05530875995789128, "grad_norm": 0.37109375, "grad_norm_var": 0.004012552897135416, "learning_rate": 0.0001, "loss": 1.4569, "loss/crossentropy": 2.7733832597732544, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.17958146333694458, "step": 3704 }, { "epoch": 0.05532369212850626, "grad_norm": 0.369140625, "grad_norm_var": 0.004084205627441407, "learning_rate": 0.0001, "loss": 1.4894, "loss/crossentropy": 2.684685230255127, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.1964697390794754, "step": 3705 }, { "epoch": 0.055338624299121245, "grad_norm": 0.361328125, "grad_norm_var": 0.004084205627441407, "learning_rate": 0.0001, "loss": 1.4977, "loss/crossentropy": 2.6738970279693604, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.19303949922323227, "step": 3706 }, { "epoch": 0.05535355646973622, "grad_norm": 0.427734375, "grad_norm_var": 0.0028185367584228514, "learning_rate": 0.0001, "loss": 1.433, "loss/crossentropy": 2.4781343936920166, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.1673840433359146, "step": 3707 }, { "epoch": 0.0553684886403512, "grad_norm": 0.349609375, "grad_norm_var": 0.002929798762003581, "learning_rate": 0.0001, "loss": 1.3896, "loss/crossentropy": 2.5251494646072388, "loss/fcd": 1.22265625, "loss/idx": 9.0, "loss/logits": 0.16694702953100204, "step": 3708 }, { "epoch": 0.055383420810966186, "grad_norm": 0.380859375, "grad_norm_var": 0.0029387791951497396, "learning_rate": 0.0001, "loss": 1.5782, "loss/crossentropy": 2.5023609399795532, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.2149159163236618, "step": 3709 }, { "epoch": 0.05539835298158117, "grad_norm": 0.37890625, "grad_norm_var": 0.0029006322224934896, "learning_rate": 0.0001, "loss": 1.3859, "loss/crossentropy": 2.6567333936691284, "loss/fcd": 1.22265625, "loss/idx": 9.0, "loss/logits": 0.16326270997524261, "step": 3710 }, { "epoch": 0.05541328515219615, "grad_norm": 0.37890625, "grad_norm_var": 0.002660226821899414, "learning_rate": 0.0001, "loss": 1.507, "loss/crossentropy": 2.706404447555542, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19452358037233353, "step": 3711 }, { "epoch": 0.055428217322811134, "grad_norm": 0.384765625, "grad_norm_var": 0.0025868733723958333, "learning_rate": 0.0001, "loss": 1.5658, "loss/crossentropy": 2.639408230781555, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.22207536548376083, "step": 3712 }, { "epoch": 0.05544314949342611, "grad_norm": 0.36328125, "grad_norm_var": 0.002620808283487956, "learning_rate": 0.0001, "loss": 1.6846, "loss/crossentropy": 2.5281091928482056, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.23538661003112793, "step": 3713 }, { "epoch": 0.05545808166404109, "grad_norm": 0.392578125, "grad_norm_var": 0.002613178888956706, "learning_rate": 0.0001, "loss": 1.5487, "loss/crossentropy": 2.7200814485549927, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.2010410875082016, "step": 3714 }, { "epoch": 0.055473013834656075, "grad_norm": 0.42578125, "grad_norm_var": 0.0025455315907796225, "learning_rate": 0.0001, "loss": 1.7167, "loss/crossentropy": 2.3740601539611816, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.22839871793985367, "step": 3715 }, { "epoch": 0.05548794600527106, "grad_norm": 0.392578125, "grad_norm_var": 0.0023976643880208332, "learning_rate": 0.0001, "loss": 1.5519, "loss/crossentropy": 2.6769533157348633, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19253908842802048, "step": 3716 }, { "epoch": 0.05550287817588604, "grad_norm": 0.41796875, "grad_norm_var": 0.0012231826782226562, "learning_rate": 0.0001, "loss": 1.6428, "loss/crossentropy": 2.7813827991485596, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.22090502083301544, "step": 3717 }, { "epoch": 0.055517810346501016, "grad_norm": 0.37890625, "grad_norm_var": 0.0012227376302083334, "learning_rate": 0.0001, "loss": 1.6094, "loss/crossentropy": 2.633689045906067, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.21093353629112244, "step": 3718 }, { "epoch": 0.055532742517116, "grad_norm": 0.349609375, "grad_norm_var": 0.0005810896555582683, "learning_rate": 0.0001, "loss": 1.5104, "loss/crossentropy": 2.4370440244674683, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19794463366270065, "step": 3719 }, { "epoch": 0.05554767468773098, "grad_norm": 0.400390625, "grad_norm_var": 0.000589434305826823, "learning_rate": 0.0001, "loss": 1.6437, "loss/crossentropy": 2.6751633882522583, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.2139742597937584, "step": 3720 }, { "epoch": 0.055562606858345964, "grad_norm": 0.36328125, "grad_norm_var": 0.0006035963694254557, "learning_rate": 0.0001, "loss": 1.4566, "loss/crossentropy": 2.7486571073532104, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.1831304356455803, "step": 3721 }, { "epoch": 0.05557753902896095, "grad_norm": 0.41796875, "grad_norm_var": 0.0006317138671875, "learning_rate": 0.0001, "loss": 1.5299, "loss/crossentropy": 2.5105955600738525, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.19395986944437027, "step": 3722 }, { "epoch": 0.05559247119957593, "grad_norm": 0.375, "grad_norm_var": 0.0005239963531494141, "learning_rate": 0.0001, "loss": 1.5017, "loss/crossentropy": 2.7136651277542114, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.19700505584478378, "step": 3723 }, { "epoch": 0.055607403370190905, "grad_norm": 0.36328125, "grad_norm_var": 0.000472259521484375, "learning_rate": 0.0001, "loss": 1.4682, "loss/crossentropy": 2.722186326980591, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18306799232959747, "step": 3724 }, { "epoch": 0.05562233554080589, "grad_norm": 0.34765625, "grad_norm_var": 0.0005606174468994141, "learning_rate": 0.0001, "loss": 1.4216, "loss/crossentropy": 2.6855839490890503, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.1676524430513382, "step": 3725 }, { "epoch": 0.05563726771142087, "grad_norm": 0.400390625, "grad_norm_var": 0.0005772272745768229, "learning_rate": 0.0001, "loss": 1.5747, "loss/crossentropy": 2.8815879821777344, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.19973523914813995, "step": 3726 }, { "epoch": 0.055652199882035853, "grad_norm": 0.40234375, "grad_norm_var": 0.0005940119425455729, "learning_rate": 0.0001, "loss": 1.6559, "loss/crossentropy": 2.4909443855285645, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.2067064866423607, "step": 3727 }, { "epoch": 0.055667132052650836, "grad_norm": 0.39453125, "grad_norm_var": 0.0005983829498291016, "learning_rate": 0.0001, "loss": 1.4328, "loss/crossentropy": 2.3917750120162964, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.17108170688152313, "step": 3728 }, { "epoch": 0.05568206422326581, "grad_norm": 0.4921875, "grad_norm_var": 0.0012362003326416016, "learning_rate": 0.0001, "loss": 1.5791, "loss/crossentropy": 2.617579221725464, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.20804237574338913, "step": 3729 }, { "epoch": 0.055696996393880795, "grad_norm": 0.365234375, "grad_norm_var": 0.0012904961903889975, "learning_rate": 0.0001, "loss": 1.4686, "loss/crossentropy": 2.6406227350234985, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.1795107051730156, "step": 3730 }, { "epoch": 0.05571192856449578, "grad_norm": 0.41015625, "grad_norm_var": 0.001237344741821289, "learning_rate": 0.0001, "loss": 1.6131, "loss/crossentropy": 2.7097063064575195, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.23028813302516937, "step": 3731 }, { "epoch": 0.05572686073511076, "grad_norm": 0.427734375, "grad_norm_var": 0.001317453384399414, "learning_rate": 0.0001, "loss": 1.6281, "loss/crossentropy": 2.663530707359314, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.21407821774482727, "step": 3732 }, { "epoch": 0.05574179290572574, "grad_norm": 0.48046875, "grad_norm_var": 0.001759958267211914, "learning_rate": 0.0001, "loss": 1.6161, "loss/crossentropy": 2.7848753929138184, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.21763186901807785, "step": 3733 }, { "epoch": 0.055756725076340725, "grad_norm": 0.3984375, "grad_norm_var": 0.0017338911692301431, "learning_rate": 0.0001, "loss": 1.4851, "loss/crossentropy": 2.875620484352112, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18040134757757187, "step": 3734 }, { "epoch": 0.0557716572469557, "grad_norm": 0.37890625, "grad_norm_var": 0.0015934626261393228, "learning_rate": 0.0001, "loss": 1.506, "loss/crossentropy": 2.5425866842269897, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.18173538148403168, "step": 3735 }, { "epoch": 0.055786589417570684, "grad_norm": 0.39453125, "grad_norm_var": 0.0015961805979410808, "learning_rate": 0.0001, "loss": 1.5926, "loss/crossentropy": 2.8191685676574707, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.2254023626446724, "step": 3736 }, { "epoch": 0.05580152158818567, "grad_norm": 0.34765625, "grad_norm_var": 0.0016895135243733723, "learning_rate": 0.0001, "loss": 1.4504, "loss/crossentropy": 2.718753695487976, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.17303400486707687, "step": 3737 }, { "epoch": 0.05581645375880065, "grad_norm": 0.36328125, "grad_norm_var": 0.0017438093821207682, "learning_rate": 0.0001, "loss": 1.4721, "loss/crossentropy": 2.613106369972229, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18697256594896317, "step": 3738 }, { "epoch": 0.05583138592941563, "grad_norm": 0.66015625, "grad_norm_var": 0.006013727188110352, "learning_rate": 0.0001, "loss": 2.0792, "loss/crossentropy": 2.5534632205963135, "loss/fcd": 1.7578125, "loss/idx": 9.0, "loss/logits": 0.3214232921600342, "step": 3739 }, { "epoch": 0.05584631810003061, "grad_norm": 0.419921875, "grad_norm_var": 0.005829811096191406, "learning_rate": 0.0001, "loss": 1.7323, "loss/crossentropy": 2.467501401901245, "loss/fcd": 1.51953125, "loss/idx": 9.0, "loss/logits": 0.21275699138641357, "step": 3740 }, { "epoch": 0.05586125027064559, "grad_norm": 0.447265625, "grad_norm_var": 0.005519342422485351, "learning_rate": 0.0001, "loss": 1.4813, "loss/crossentropy": 2.7355436086654663, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.16494011133909225, "step": 3741 }, { "epoch": 0.05587618244126057, "grad_norm": 0.396484375, "grad_norm_var": 0.005532566706339518, "learning_rate": 0.0001, "loss": 1.6446, "loss/crossentropy": 2.6781444549560547, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.226677805185318, "step": 3742 }, { "epoch": 0.055891114611875556, "grad_norm": 0.3671875, "grad_norm_var": 0.005709950129191081, "learning_rate": 0.0001, "loss": 1.566, "loss/crossentropy": 2.5764135122299194, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.21050365269184113, "step": 3743 }, { "epoch": 0.05590604678249054, "grad_norm": 0.419921875, "grad_norm_var": 0.005658912658691406, "learning_rate": 0.0001, "loss": 1.4831, "loss/crossentropy": 2.526724934577942, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.18235041946172714, "step": 3744 }, { "epoch": 0.05592097895310552, "grad_norm": 0.39453125, "grad_norm_var": 0.005355326334635416, "learning_rate": 0.0001, "loss": 1.5595, "loss/crossentropy": 2.599186420440674, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20008239150047302, "step": 3745 }, { "epoch": 0.0559359111237205, "grad_norm": 0.36328125, "grad_norm_var": 0.005369043350219727, "learning_rate": 0.0001, "loss": 1.409, "loss/crossentropy": 2.4608147144317627, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.1628817617893219, "step": 3746 }, { "epoch": 0.05595084329433548, "grad_norm": 0.390625, "grad_norm_var": 0.0054103692372639975, "learning_rate": 0.0001, "loss": 1.4507, "loss/crossentropy": 2.5085036754608154, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17723850160837173, "step": 3747 }, { "epoch": 0.05596577546495046, "grad_norm": 0.376953125, "grad_norm_var": 0.005489714940388997, "learning_rate": 0.0001, "loss": 1.5338, "loss/crossentropy": 2.769017457962036, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.20563380420207977, "step": 3748 }, { "epoch": 0.055980707635565445, "grad_norm": 0.390625, "grad_norm_var": 0.005179707209269206, "learning_rate": 0.0001, "loss": 1.458, "loss/crossentropy": 2.6370351314544678, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.1650635451078415, "step": 3749 }, { "epoch": 0.05599563980618043, "grad_norm": 0.416015625, "grad_norm_var": 0.005179278055826823, "learning_rate": 0.0001, "loss": 1.4544, "loss/crossentropy": 2.7494232654571533, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.19270864129066467, "step": 3750 }, { "epoch": 0.056010571976795404, "grad_norm": 0.408203125, "grad_norm_var": 0.005119434992472331, "learning_rate": 0.0001, "loss": 1.5861, "loss/crossentropy": 2.6390587091445923, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.21504950523376465, "step": 3751 }, { "epoch": 0.056025504147410386, "grad_norm": 0.369140625, "grad_norm_var": 0.005211385091145834, "learning_rate": 0.0001, "loss": 1.6049, "loss/crossentropy": 2.579266905784607, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.21819621324539185, "step": 3752 }, { "epoch": 0.05604043631802537, "grad_norm": 0.337890625, "grad_norm_var": 0.005296182632446289, "learning_rate": 0.0001, "loss": 1.5347, "loss/crossentropy": 2.357532501220703, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.19095125049352646, "step": 3753 }, { "epoch": 0.05605536848864035, "grad_norm": 0.375, "grad_norm_var": 0.005235528945922852, "learning_rate": 0.0001, "loss": 1.7371, "loss/crossentropy": 2.5350353717803955, "loss/fcd": 1.5, "loss/idx": 9.0, "loss/logits": 0.2371249869465828, "step": 3754 }, { "epoch": 0.056070300659255334, "grad_norm": 0.3671875, "grad_norm_var": 0.0007627964019775391, "learning_rate": 0.0001, "loss": 1.5215, "loss/crossentropy": 2.669775128364563, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.1972559466958046, "step": 3755 }, { "epoch": 0.05608523282987032, "grad_norm": 0.3671875, "grad_norm_var": 0.000726318359375, "learning_rate": 0.0001, "loss": 1.694, "loss/crossentropy": 2.6114906072616577, "loss/fcd": 1.45703125, "loss/idx": 9.0, "loss/logits": 0.23697996884584427, "step": 3756 }, { "epoch": 0.05610016500048529, "grad_norm": 0.37890625, "grad_norm_var": 0.0004665215810139974, "learning_rate": 0.0001, "loss": 1.433, "loss/crossentropy": 2.7963480949401855, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.186898872256279, "step": 3757 }, { "epoch": 0.056115097171100276, "grad_norm": 0.36328125, "grad_norm_var": 0.00047327677408854165, "learning_rate": 0.0001, "loss": 1.5299, "loss/crossentropy": 2.5553783178329468, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.19400957226753235, "step": 3758 }, { "epoch": 0.05613002934171526, "grad_norm": 0.375, "grad_norm_var": 0.0004633585611979167, "learning_rate": 0.0001, "loss": 1.5349, "loss/crossentropy": 2.6183362007141113, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.1793874278664589, "step": 3759 }, { "epoch": 0.05614496151233024, "grad_norm": 0.330078125, "grad_norm_var": 0.0004999160766601563, "learning_rate": 0.0001, "loss": 1.494, "loss/crossentropy": 2.581301689147949, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.19321970641613007, "step": 3760 }, { "epoch": 0.056159893682945224, "grad_norm": 0.421875, "grad_norm_var": 0.0006169637044270833, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.662195324897766, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17661288380622864, "step": 3761 }, { "epoch": 0.0561748258535602, "grad_norm": 0.345703125, "grad_norm_var": 0.0006683190663655599, "learning_rate": 0.0001, "loss": 1.4732, "loss/crossentropy": 2.7082637548446655, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.19193704426288605, "step": 3762 }, { "epoch": 0.05618975802417518, "grad_norm": 0.375, "grad_norm_var": 0.0006528059641520182, "learning_rate": 0.0001, "loss": 1.4645, "loss/crossentropy": 2.477115750312805, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.17936262488365173, "step": 3763 }, { "epoch": 0.056204690194790165, "grad_norm": 0.35546875, "grad_norm_var": 0.0006757100423177083, "learning_rate": 0.0001, "loss": 1.4875, "loss/crossentropy": 2.720965027809143, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.19449830055236816, "step": 3764 }, { "epoch": 0.05621962236540515, "grad_norm": 0.37890625, "grad_norm_var": 0.0006575902303059896, "learning_rate": 0.0001, "loss": 1.4812, "loss/crossentropy": 2.739427089691162, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.1882271096110344, "step": 3765 }, { "epoch": 0.05623455453602013, "grad_norm": 0.349609375, "grad_norm_var": 0.0005505879720052084, "learning_rate": 0.0001, "loss": 1.4761, "loss/crossentropy": 2.805173397064209, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18703017383813858, "step": 3766 }, { "epoch": 0.05624948670663511, "grad_norm": 0.8359375, "grad_norm_var": 0.01424101193745931, "learning_rate": 0.0001, "loss": 1.7867, "loss/crossentropy": 2.3584073185920715, "loss/fcd": 1.546875, "loss/idx": 9.0, "loss/logits": 0.23986849188804626, "step": 3767 }, { "epoch": 0.05626441887725009, "grad_norm": 0.33984375, "grad_norm_var": 0.014397176106770833, "learning_rate": 0.0001, "loss": 1.4476, "loss/crossentropy": 2.4815826416015625, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17415082454681396, "step": 3768 }, { "epoch": 0.05627935104786507, "grad_norm": 0.42578125, "grad_norm_var": 0.014227660497029622, "learning_rate": 0.0001, "loss": 1.5986, "loss/crossentropy": 2.508493661880493, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.21577561646699905, "step": 3769 }, { "epoch": 0.056294283218480054, "grad_norm": 0.404296875, "grad_norm_var": 0.014187367757161458, "learning_rate": 0.0001, "loss": 1.4835, "loss/crossentropy": 2.5545494556427, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.16314414143562317, "step": 3770 }, { "epoch": 0.05630921538909504, "grad_norm": 0.408203125, "grad_norm_var": 0.01410826047261556, "learning_rate": 0.0001, "loss": 1.6477, "loss/crossentropy": 2.4125750064849854, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.2180478423833847, "step": 3771 }, { "epoch": 0.05632414755971002, "grad_norm": 0.43359375, "grad_norm_var": 0.014062865575154623, "learning_rate": 0.0001, "loss": 1.5158, "loss/crossentropy": 2.8013776540756226, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.20725039392709732, "step": 3772 }, { "epoch": 0.056339079730325, "grad_norm": 0.35546875, "grad_norm_var": 0.014186843236287435, "learning_rate": 0.0001, "loss": 1.5022, "loss/crossentropy": 2.5758689641952515, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.18188168108463287, "step": 3773 }, { "epoch": 0.05635401190093998, "grad_norm": 0.390625, "grad_norm_var": 0.014077361424763997, "learning_rate": 0.0001, "loss": 1.5804, "loss/crossentropy": 2.736787796020508, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.19366531819105148, "step": 3774 }, { "epoch": 0.05636894407155496, "grad_norm": 0.333984375, "grad_norm_var": 0.014362080891927084, "learning_rate": 0.0001, "loss": 1.532, "loss/crossentropy": 2.4224735498428345, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19996365159749985, "step": 3775 }, { "epoch": 0.05638387624216994, "grad_norm": 0.408203125, "grad_norm_var": 0.01396026611328125, "learning_rate": 0.0001, "loss": 1.5907, "loss/crossentropy": 2.5120980739593506, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.20002592355012894, "step": 3776 }, { "epoch": 0.056398808412784926, "grad_norm": 0.447265625, "grad_norm_var": 0.014040231704711914, "learning_rate": 0.0001, "loss": 1.5827, "loss/crossentropy": 2.7680128812789917, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.1920301616191864, "step": 3777 }, { "epoch": 0.05641374058339991, "grad_norm": 0.380859375, "grad_norm_var": 0.013807916641235351, "learning_rate": 0.0001, "loss": 1.4291, "loss/crossentropy": 2.680113434791565, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.17522357404232025, "step": 3778 }, { "epoch": 0.056428672754014884, "grad_norm": 0.44140625, "grad_norm_var": 0.013738743464152018, "learning_rate": 0.0001, "loss": 1.6453, "loss/crossentropy": 2.505802869796753, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.21951545774936676, "step": 3779 }, { "epoch": 0.05644360492462987, "grad_norm": 0.373046875, "grad_norm_var": 0.013611284891764323, "learning_rate": 0.0001, "loss": 1.4292, "loss/crossentropy": 2.706306219100952, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.17533015459775925, "step": 3780 }, { "epoch": 0.05645853709524485, "grad_norm": 0.375, "grad_norm_var": 0.013633219401041667, "learning_rate": 0.0001, "loss": 1.68, "loss/crossentropy": 2.5189425945281982, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.25810082256793976, "step": 3781 }, { "epoch": 0.05647346926585983, "grad_norm": 0.396484375, "grad_norm_var": 0.013337198893229167, "learning_rate": 0.0001, "loss": 1.42, "loss/crossentropy": 2.6869864463806152, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.16219748556613922, "step": 3782 }, { "epoch": 0.056488401436474815, "grad_norm": 0.419921875, "grad_norm_var": 0.0011864821116129556, "learning_rate": 0.0001, "loss": 1.6299, "loss/crossentropy": 2.4763442277908325, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.21196503192186356, "step": 3783 }, { "epoch": 0.0565033336070898, "grad_norm": 0.37109375, "grad_norm_var": 0.0010140577952067057, "learning_rate": 0.0001, "loss": 1.5867, "loss/crossentropy": 2.6497581005096436, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.21953890472650528, "step": 3784 }, { "epoch": 0.056518265777704774, "grad_norm": 0.462890625, "grad_norm_var": 0.0012384414672851562, "learning_rate": 0.0001, "loss": 1.6938, "loss/crossentropy": 2.682197093963623, "loss/fcd": 1.46875, "loss/idx": 9.0, "loss/logits": 0.22500663995742798, "step": 3785 }, { "epoch": 0.056533197948319756, "grad_norm": 0.369140625, "grad_norm_var": 0.001296234130859375, "learning_rate": 0.0001, "loss": 1.4686, "loss/crossentropy": 2.4299134016036987, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.1834033653140068, "step": 3786 }, { "epoch": 0.05654813011893474, "grad_norm": 0.3984375, "grad_norm_var": 0.0012888431549072266, "learning_rate": 0.0001, "loss": 1.3921, "loss/crossentropy": 2.6030280590057373, "loss/fcd": 1.22265625, "loss/idx": 9.0, "loss/logits": 0.16946090012788773, "step": 3787 }, { "epoch": 0.05656306228954972, "grad_norm": 0.40625, "grad_norm_var": 0.0012033939361572265, "learning_rate": 0.0001, "loss": 1.5588, "loss/crossentropy": 2.4874579906463623, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.19552797824144363, "step": 3788 }, { "epoch": 0.056577994460164704, "grad_norm": 0.396484375, "grad_norm_var": 0.0010889053344726562, "learning_rate": 0.0001, "loss": 1.6623, "loss/crossentropy": 2.3872991800308228, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.20918622612953186, "step": 3789 }, { "epoch": 0.05659292663077968, "grad_norm": 0.375, "grad_norm_var": 0.0011199315388997395, "learning_rate": 0.0001, "loss": 1.4723, "loss/crossentropy": 2.757049083709717, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18322757631540298, "step": 3790 }, { "epoch": 0.05660785880139466, "grad_norm": 0.474609375, "grad_norm_var": 0.0011702855428059897, "learning_rate": 0.0001, "loss": 1.7033, "loss/crossentropy": 2.959465980529785, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.21503660827875137, "step": 3791 }, { "epoch": 0.056622790972009646, "grad_norm": 0.408203125, "grad_norm_var": 0.0011702855428059897, "learning_rate": 0.0001, "loss": 1.7295, "loss/crossentropy": 2.519634962081909, "loss/fcd": 1.4921875, "loss/idx": 9.0, "loss/logits": 0.23726502060890198, "step": 3792 }, { "epoch": 0.05663772314262463, "grad_norm": 0.44921875, "grad_norm_var": 0.0011812686920166016, "learning_rate": 0.0001, "loss": 1.5999, "loss/crossentropy": 2.8559629917144775, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.19366700947284698, "step": 3793 }, { "epoch": 0.05665265531323961, "grad_norm": 0.375, "grad_norm_var": 0.001203155517578125, "learning_rate": 0.0001, "loss": 1.6039, "loss/crossentropy": 2.2534974813461304, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.22501251101493835, "step": 3794 }, { "epoch": 0.056667587483854594, "grad_norm": 0.384765625, "grad_norm_var": 0.0011344750722249348, "learning_rate": 0.0001, "loss": 1.4605, "loss/crossentropy": 2.46198046207428, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.17921528220176697, "step": 3795 }, { "epoch": 0.05668251965446957, "grad_norm": 0.388671875, "grad_norm_var": 0.0010889530181884765, "learning_rate": 0.0001, "loss": 1.5527, "loss/crossentropy": 2.4556630849838257, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.18943732976913452, "step": 3796 }, { "epoch": 0.05669745182508455, "grad_norm": 0.447265625, "grad_norm_var": 0.001143646240234375, "learning_rate": 0.0001, "loss": 1.6257, "loss/crossentropy": 2.6255940198898315, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.203867569565773, "step": 3797 }, { "epoch": 0.056712383995699535, "grad_norm": 1.125, "grad_norm_var": 0.03322370847066244, "learning_rate": 0.0001, "loss": 2.2323, "loss/crossentropy": 2.579520106315613, "loss/fcd": 1.8046875, "loss/idx": 9.0, "loss/logits": 0.4275740385055542, "step": 3798 }, { "epoch": 0.05672731616631452, "grad_norm": 0.40625, "grad_norm_var": 0.03329614003499349, "learning_rate": 0.0001, "loss": 1.5417, "loss/crossentropy": 2.6353012323379517, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19015759229660034, "step": 3799 }, { "epoch": 0.0567422483369295, "grad_norm": 0.41796875, "grad_norm_var": 0.032925351460774736, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.730196714401245, "loss/fcd": 1.20703125, "loss/idx": 9.0, "loss/logits": 0.16317017376422882, "step": 3800 }, { "epoch": 0.056757180507544476, "grad_norm": 0.4453125, "grad_norm_var": 0.03292692502339681, "learning_rate": 0.0001, "loss": 1.6254, "loss/crossentropy": 2.6941781044006348, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.21911291033029556, "step": 3801 }, { "epoch": 0.05677211267815946, "grad_norm": 0.3984375, "grad_norm_var": 0.03264821370442708, "learning_rate": 0.0001, "loss": 1.575, "loss/crossentropy": 2.4746668338775635, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.20395252853631973, "step": 3802 }, { "epoch": 0.05678704484877444, "grad_norm": 0.3984375, "grad_norm_var": 0.03264821370442708, "learning_rate": 0.0001, "loss": 1.5387, "loss/crossentropy": 2.6892656087875366, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.18326956033706665, "step": 3803 }, { "epoch": 0.056801977019389424, "grad_norm": 0.37890625, "grad_norm_var": 0.03287652333577474, "learning_rate": 0.0001, "loss": 1.6524, "loss/crossentropy": 2.4644945859909058, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.2148839458823204, "step": 3804 }, { "epoch": 0.05681690919000441, "grad_norm": 0.44140625, "grad_norm_var": 0.03265608151753743, "learning_rate": 0.0001, "loss": 1.5343, "loss/crossentropy": 2.6071112155914307, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.2022630274295807, "step": 3805 }, { "epoch": 0.05683184136061939, "grad_norm": 0.38671875, "grad_norm_var": 0.03253630002339681, "learning_rate": 0.0001, "loss": 1.5386, "loss/crossentropy": 2.7112845182418823, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.20262838900089264, "step": 3806 }, { "epoch": 0.056846773531234365, "grad_norm": 0.3984375, "grad_norm_var": 0.032729085286458334, "learning_rate": 0.0001, "loss": 1.4591, "loss/crossentropy": 2.6389410495758057, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.1739434152841568, "step": 3807 }, { "epoch": 0.05686170570184935, "grad_norm": 0.40234375, "grad_norm_var": 0.032766326268513994, "learning_rate": 0.0001, "loss": 1.543, "loss/crossentropy": 2.6270145177841187, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19142841547727585, "step": 3808 }, { "epoch": 0.05687663787246433, "grad_norm": 0.341796875, "grad_norm_var": 0.03353824615478516, "learning_rate": 0.0001, "loss": 1.5572, "loss/crossentropy": 2.6101393699645996, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.20559698343276978, "step": 3809 }, { "epoch": 0.05689157004307931, "grad_norm": 0.384765625, "grad_norm_var": 0.03345170021057129, "learning_rate": 0.0001, "loss": 1.5346, "loss/crossentropy": 2.528368353843689, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.19087176024913788, "step": 3810 }, { "epoch": 0.056906502213694296, "grad_norm": 0.474609375, "grad_norm_var": 0.03321480751037598, "learning_rate": 0.0001, "loss": 1.5623, "loss/crossentropy": 2.678430676460266, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.19509097933769226, "step": 3811 }, { "epoch": 0.05692143438430927, "grad_norm": 0.375, "grad_norm_var": 0.03334242502848307, "learning_rate": 0.0001, "loss": 1.4909, "loss/crossentropy": 2.4095312356948853, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.17844344675540924, "step": 3812 }, { "epoch": 0.056936366554924255, "grad_norm": 0.390625, "grad_norm_var": 0.03357427914937337, "learning_rate": 0.0001, "loss": 1.4859, "loss/crossentropy": 2.5598933696746826, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.1811675727367401, "step": 3813 }, { "epoch": 0.05695129872553924, "grad_norm": 0.37109375, "grad_norm_var": 0.0010326226552327474, "learning_rate": 0.0001, "loss": 1.4977, "loss/crossentropy": 2.656238317489624, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.2008385807275772, "step": 3814 }, { "epoch": 0.05696623089615422, "grad_norm": 0.65234375, "grad_norm_var": 0.004998000462849935, "learning_rate": 0.0001, "loss": 1.8085, "loss/crossentropy": 2.5620462894439697, "loss/fcd": 1.56640625, "loss/idx": 9.0, "loss/logits": 0.24204862862825394, "step": 3815 }, { "epoch": 0.0569811630667692, "grad_norm": 0.3671875, "grad_norm_var": 0.00514677365620931, "learning_rate": 0.0001, "loss": 1.5591, "loss/crossentropy": 2.6094738245010376, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.21141008287668228, "step": 3816 }, { "epoch": 0.056996095237384185, "grad_norm": 0.396484375, "grad_norm_var": 0.005085182189941406, "learning_rate": 0.0001, "loss": 1.6791, "loss/crossentropy": 2.4573291540145874, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.2298642247915268, "step": 3817 }, { "epoch": 0.05701102740799916, "grad_norm": 0.40625, "grad_norm_var": 0.005077044169108073, "learning_rate": 0.0001, "loss": 1.5918, "loss/crossentropy": 2.500963568687439, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.22465456277132034, "step": 3818 }, { "epoch": 0.057025959578614144, "grad_norm": 0.44140625, "grad_norm_var": 0.0051239013671875, "learning_rate": 0.0001, "loss": 1.6144, "loss/crossentropy": 2.541113495826721, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.20427316427230835, "step": 3819 }, { "epoch": 0.057040891749229126, "grad_norm": 0.400390625, "grad_norm_var": 0.005054839452107747, "learning_rate": 0.0001, "loss": 1.612, "loss/crossentropy": 2.6989128589630127, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.22531332820653915, "step": 3820 }, { "epoch": 0.05705582391984411, "grad_norm": 0.376953125, "grad_norm_var": 0.00508263905843099, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.61294424533844, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.1785218045115471, "step": 3821 }, { "epoch": 0.05707075609045909, "grad_norm": 0.390625, "grad_norm_var": 0.005071258544921875, "learning_rate": 0.0001, "loss": 1.5026, "loss/crossentropy": 2.751893401145935, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19012700021266937, "step": 3822 }, { "epoch": 0.05708568826107407, "grad_norm": 0.36328125, "grad_norm_var": 0.005205726623535157, "learning_rate": 0.0001, "loss": 1.5466, "loss/crossentropy": 2.5752633810043335, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19508346915245056, "step": 3823 }, { "epoch": 0.05710062043168905, "grad_norm": 0.3984375, "grad_norm_var": 0.005209859212239583, "learning_rate": 0.0001, "loss": 1.5452, "loss/crossentropy": 2.762110471725464, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.20925069600343704, "step": 3824 }, { "epoch": 0.05711555260230403, "grad_norm": 0.42578125, "grad_norm_var": 0.004907083511352539, "learning_rate": 0.0001, "loss": 1.6565, "loss/crossentropy": 2.4816421270370483, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.21513229608535767, "step": 3825 }, { "epoch": 0.057130484772919016, "grad_norm": 0.388671875, "grad_norm_var": 0.004893096288045248, "learning_rate": 0.0001, "loss": 1.546, "loss/crossentropy": 2.6432483196258545, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.19832369685173035, "step": 3826 }, { "epoch": 0.057145416943534, "grad_norm": 0.384765625, "grad_norm_var": 0.004667901992797851, "learning_rate": 0.0001, "loss": 1.5251, "loss/crossentropy": 2.4950932264328003, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.18920005857944489, "step": 3827 }, { "epoch": 0.05716034911414898, "grad_norm": 0.38671875, "grad_norm_var": 0.004624795913696289, "learning_rate": 0.0001, "loss": 1.615, "loss/crossentropy": 2.637043595314026, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.20485756546258926, "step": 3828 }, { "epoch": 0.05717528128476396, "grad_norm": 1.0625, "grad_norm_var": 0.031208912531534832, "learning_rate": 0.0001, "loss": 2.2269, "loss/crossentropy": 2.0319610238075256, "loss/fcd": 2.0546875, "loss/idx": 9.0, "loss/logits": 0.17216961830854416, "step": 3829 }, { "epoch": 0.05719021345537894, "grad_norm": 0.376953125, "grad_norm_var": 0.031148783365885415, "learning_rate": 0.0001, "loss": 1.5306, "loss/crossentropy": 2.8081737756729126, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.1985355168581009, "step": 3830 }, { "epoch": 0.05720514562599392, "grad_norm": 0.421875, "grad_norm_var": 0.02828667958577474, "learning_rate": 0.0001, "loss": 1.4969, "loss/crossentropy": 2.6765776872634888, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.18830083310604095, "step": 3831 }, { "epoch": 0.057220077796608905, "grad_norm": 0.46484375, "grad_norm_var": 0.02797673543294271, "learning_rate": 0.0001, "loss": 1.6105, "loss/crossentropy": 2.6856452226638794, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.2121018022298813, "step": 3832 }, { "epoch": 0.05723500996722389, "grad_norm": 0.400390625, "grad_norm_var": 0.027953529357910158, "learning_rate": 0.0001, "loss": 1.5416, "loss/crossentropy": 2.4417061805725098, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.20171640068292618, "step": 3833 }, { "epoch": 0.05724994213783887, "grad_norm": 0.3984375, "grad_norm_var": 0.027995745340983074, "learning_rate": 0.0001, "loss": 1.6013, "loss/crossentropy": 2.6474087238311768, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.20675761997699738, "step": 3834 }, { "epoch": 0.057264874308453846, "grad_norm": 0.359375, "grad_norm_var": 0.028429667154947918, "learning_rate": 0.0001, "loss": 1.3652, "loss/crossentropy": 2.595506429672241, "loss/fcd": 1.20703125, "loss/idx": 9.0, "loss/logits": 0.1581687033176422, "step": 3835 }, { "epoch": 0.05727980647906883, "grad_norm": 0.345703125, "grad_norm_var": 0.028887176513671876, "learning_rate": 0.0001, "loss": 1.5018, "loss/crossentropy": 2.527846336364746, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.1892995834350586, "step": 3836 }, { "epoch": 0.05729473864968381, "grad_norm": 0.359375, "grad_norm_var": 0.02904038429260254, "learning_rate": 0.0001, "loss": 1.4907, "loss/crossentropy": 2.643888473510742, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18597479909658432, "step": 3837 }, { "epoch": 0.057309670820298794, "grad_norm": 0.384765625, "grad_norm_var": 0.02907562255859375, "learning_rate": 0.0001, "loss": 1.6995, "loss/crossentropy": 2.684089779853821, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.23470385372638702, "step": 3838 }, { "epoch": 0.05732460299091378, "grad_norm": 0.443359375, "grad_norm_var": 0.028736098607381185, "learning_rate": 0.0001, "loss": 1.7641, "loss/crossentropy": 2.803832769393921, "loss/fcd": 1.4921875, "loss/idx": 9.0, "loss/logits": 0.27190401405096054, "step": 3839 }, { "epoch": 0.05733953516152875, "grad_norm": 0.375, "grad_norm_var": 0.028892882664998374, "learning_rate": 0.0001, "loss": 1.5829, "loss/crossentropy": 2.818521499633789, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.2078854665160179, "step": 3840 }, { "epoch": 0.057354467332143735, "grad_norm": 0.33203125, "grad_norm_var": 0.029571898778279624, "learning_rate": 0.0001, "loss": 1.4631, "loss/crossentropy": 2.5548449754714966, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18576644361019135, "step": 3841 }, { "epoch": 0.05736939950275872, "grad_norm": 0.380859375, "grad_norm_var": 0.02961907386779785, "learning_rate": 0.0001, "loss": 1.6471, "loss/crossentropy": 2.5344449281692505, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.21746084094047546, "step": 3842 }, { "epoch": 0.0573843316733737, "grad_norm": 0.390625, "grad_norm_var": 0.029586029052734376, "learning_rate": 0.0001, "loss": 1.5726, "loss/crossentropy": 2.438655376434326, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.20927318930625916, "step": 3843 }, { "epoch": 0.057399263843988683, "grad_norm": 0.40234375, "grad_norm_var": 0.02951075236002604, "learning_rate": 0.0001, "loss": 1.6501, "loss/crossentropy": 2.3320525884628296, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.22827135771512985, "step": 3844 }, { "epoch": 0.057414196014603666, "grad_norm": 0.3671875, "grad_norm_var": 0.0011957804361979166, "learning_rate": 0.0001, "loss": 1.5072, "loss/crossentropy": 2.5435558557510376, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.18299423903226852, "step": 3845 }, { "epoch": 0.05742912818521864, "grad_norm": 0.41796875, "grad_norm_var": 0.0012421766916910807, "learning_rate": 0.0001, "loss": 1.5855, "loss/crossentropy": 2.6868897676467896, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.1949010193347931, "step": 3846 }, { "epoch": 0.057444060355833625, "grad_norm": 0.349609375, "grad_norm_var": 0.0012639363606770833, "learning_rate": 0.0001, "loss": 1.4517, "loss/crossentropy": 2.5532137155532837, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.1743621528148651, "step": 3847 }, { "epoch": 0.05745899252644861, "grad_norm": 0.431640625, "grad_norm_var": 0.00098265012105306, "learning_rate": 0.0001, "loss": 1.9917, "loss/crossentropy": 2.4235039949417114, "loss/fcd": 1.71875, "loss/idx": 9.0, "loss/logits": 0.2729561999440193, "step": 3848 }, { "epoch": 0.05747392469706359, "grad_norm": 0.33203125, "grad_norm_var": 0.001122283935546875, "learning_rate": 0.0001, "loss": 1.4783, "loss/crossentropy": 2.6111477613449097, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18922605365514755, "step": 3849 }, { "epoch": 0.05748885686767857, "grad_norm": 0.46875, "grad_norm_var": 0.00160980224609375, "learning_rate": 0.0001, "loss": 1.7148, "loss/crossentropy": 2.3556206226348877, "loss/fcd": 1.49609375, "loss/idx": 9.0, "loss/logits": 0.21869590878486633, "step": 3850 }, { "epoch": 0.05750378903829355, "grad_norm": 0.396484375, "grad_norm_var": 0.0015750726064046223, "learning_rate": 0.0001, "loss": 1.7183, "loss/crossentropy": 2.582785725593567, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.2377937138080597, "step": 3851 }, { "epoch": 0.05751872120890853, "grad_norm": 0.361328125, "grad_norm_var": 0.0015061537424723307, "learning_rate": 0.0001, "loss": 1.4171, "loss/crossentropy": 2.7002570629119873, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.1749514415860176, "step": 3852 }, { "epoch": 0.057533653379523514, "grad_norm": 0.380859375, "grad_norm_var": 0.0014556248982747395, "learning_rate": 0.0001, "loss": 1.6072, "loss/crossentropy": 2.498050093650818, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.21269278228282928, "step": 3853 }, { "epoch": 0.0575485855501385, "grad_norm": 0.384765625, "grad_norm_var": 0.0014556248982747395, "learning_rate": 0.0001, "loss": 1.5392, "loss/crossentropy": 2.535312294960022, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.18761371821165085, "step": 3854 }, { "epoch": 0.05756351772075348, "grad_norm": 0.427734375, "grad_norm_var": 0.0013564427693684895, "learning_rate": 0.0001, "loss": 1.6973, "loss/crossentropy": 2.588461399078369, "loss/fcd": 1.47265625, "loss/idx": 9.0, "loss/logits": 0.22462613880634308, "step": 3855 }, { "epoch": 0.05757844989136846, "grad_norm": 0.443359375, "grad_norm_var": 0.0015350182851155598, "learning_rate": 0.0001, "loss": 1.5476, "loss/crossentropy": 2.662098526954651, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19600296020507812, "step": 3856 }, { "epoch": 0.05759338206198344, "grad_norm": 0.373046875, "grad_norm_var": 0.0013137181599934896, "learning_rate": 0.0001, "loss": 1.6066, "loss/crossentropy": 2.5224294662475586, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.21599064767360687, "step": 3857 }, { "epoch": 0.05760831423259842, "grad_norm": 0.380859375, "grad_norm_var": 0.0013137181599934896, "learning_rate": 0.0001, "loss": 1.5641, "loss/crossentropy": 2.562440037727356, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.20084908604621887, "step": 3858 }, { "epoch": 0.0576232464032134, "grad_norm": 0.478515625, "grad_norm_var": 0.0017536004384358724, "learning_rate": 0.0001, "loss": 1.7132, "loss/crossentropy": 2.4552167654037476, "loss/fcd": 1.5078125, "loss/idx": 9.0, "loss/logits": 0.20537853240966797, "step": 3859 }, { "epoch": 0.057638178573828386, "grad_norm": 0.37890625, "grad_norm_var": 0.001779921849568685, "learning_rate": 0.0001, "loss": 1.6463, "loss/crossentropy": 2.592800259590149, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.24007418751716614, "step": 3860 }, { "epoch": 0.05765311074444337, "grad_norm": 0.40625, "grad_norm_var": 0.0017131646474202475, "learning_rate": 0.0001, "loss": 1.6138, "loss/crossentropy": 2.5558592081069946, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.19578363001346588, "step": 3861 }, { "epoch": 0.057668042915058344, "grad_norm": 0.390625, "grad_norm_var": 0.0016971429189046223, "learning_rate": 0.0001, "loss": 1.4315, "loss/crossentropy": 2.6123911142349243, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.16976295411586761, "step": 3862 }, { "epoch": 0.05768297508567333, "grad_norm": 0.36328125, "grad_norm_var": 0.0016187032063802083, "learning_rate": 0.0001, "loss": 1.6433, "loss/crossentropy": 2.557361602783203, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.2175028771162033, "step": 3863 }, { "epoch": 0.05769790725628831, "grad_norm": 0.412109375, "grad_norm_var": 0.001559893290201823, "learning_rate": 0.0001, "loss": 1.611, "loss/crossentropy": 2.697765350341797, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.2203601896762848, "step": 3864 }, { "epoch": 0.05771283942690329, "grad_norm": 0.345703125, "grad_norm_var": 0.0014500776926676432, "learning_rate": 0.0001, "loss": 1.4977, "loss/crossentropy": 2.48856520652771, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.18914002925157547, "step": 3865 }, { "epoch": 0.057727771597518275, "grad_norm": 0.44140625, "grad_norm_var": 0.0012444655100504558, "learning_rate": 0.0001, "loss": 1.6462, "loss/crossentropy": 2.806189179420471, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.23603995144367218, "step": 3866 }, { "epoch": 0.05774270376813326, "grad_norm": 0.365234375, "grad_norm_var": 0.0013110955556233724, "learning_rate": 0.0001, "loss": 1.5023, "loss/crossentropy": 2.515073299407959, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.17806632071733475, "step": 3867 }, { "epoch": 0.057757635938748234, "grad_norm": 0.4140625, "grad_norm_var": 0.0012420018513997395, "learning_rate": 0.0001, "loss": 1.5298, "loss/crossentropy": 2.4918575286865234, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.1938723921775818, "step": 3868 }, { "epoch": 0.057772568109363216, "grad_norm": 0.455078125, "grad_norm_var": 0.0014050801595052083, "learning_rate": 0.0001, "loss": 1.6237, "loss/crossentropy": 2.702086925506592, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.21746251732110977, "step": 3869 }, { "epoch": 0.0577875002799782, "grad_norm": 0.419921875, "grad_norm_var": 0.0013930638631184896, "learning_rate": 0.0001, "loss": 1.5698, "loss/crossentropy": 2.6180717945098877, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.19868524372577667, "step": 3870 }, { "epoch": 0.05780243245059318, "grad_norm": 0.41796875, "grad_norm_var": 0.0013707319895426432, "learning_rate": 0.0001, "loss": 1.5691, "loss/crossentropy": 2.634122371673584, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.2097305953502655, "step": 3871 }, { "epoch": 0.057817364621208164, "grad_norm": 0.357421875, "grad_norm_var": 0.0013973077138264975, "learning_rate": 0.0001, "loss": 1.4871, "loss/crossentropy": 2.6649218797683716, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18244998157024384, "step": 3872 }, { "epoch": 0.05783229679182314, "grad_norm": 0.35546875, "grad_norm_var": 0.0014798482259114583, "learning_rate": 0.0001, "loss": 1.5073, "loss/crossentropy": 2.373100161552429, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.17527084052562714, "step": 3873 }, { "epoch": 0.05784722896243812, "grad_norm": 0.353515625, "grad_norm_var": 0.0015924453735351562, "learning_rate": 0.0001, "loss": 1.5048, "loss/crossentropy": 2.595366358757019, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19226723909378052, "step": 3874 }, { "epoch": 0.057862161133053105, "grad_norm": 0.49609375, "grad_norm_var": 0.0018023014068603515, "learning_rate": 0.0001, "loss": 1.8055, "loss/crossentropy": 2.689773440361023, "loss/fcd": 1.55859375, "loss/idx": 9.0, "loss/logits": 0.24686752259731293, "step": 3875 }, { "epoch": 0.05787709330366809, "grad_norm": 0.390625, "grad_norm_var": 0.001780557632446289, "learning_rate": 0.0001, "loss": 1.5912, "loss/crossentropy": 2.349597692489624, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.19275879859924316, "step": 3876 }, { "epoch": 0.05789202547428307, "grad_norm": 0.37109375, "grad_norm_var": 0.0018240451812744141, "learning_rate": 0.0001, "loss": 1.5573, "loss/crossentropy": 2.710457921028137, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.20183181762695312, "step": 3877 }, { "epoch": 0.057906957644898054, "grad_norm": 0.4140625, "grad_norm_var": 0.0018389225006103516, "learning_rate": 0.0001, "loss": 1.6834, "loss/crossentropy": 2.562028408050537, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.2341870814561844, "step": 3878 }, { "epoch": 0.05792188981551303, "grad_norm": 0.41015625, "grad_norm_var": 0.0017572879791259766, "learning_rate": 0.0001, "loss": 1.5292, "loss/crossentropy": 2.8787624835968018, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.20495379716157913, "step": 3879 }, { "epoch": 0.05793682198612801, "grad_norm": 0.361328125, "grad_norm_var": 0.0018448988596598307, "learning_rate": 0.0001, "loss": 1.4187, "loss/crossentropy": 2.5732473134994507, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.16480334103107452, "step": 3880 }, { "epoch": 0.057951754156742995, "grad_norm": 0.3359375, "grad_norm_var": 0.0019190470377604167, "learning_rate": 0.0001, "loss": 1.4811, "loss/crossentropy": 2.7208263874053955, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.19199632108211517, "step": 3881 }, { "epoch": 0.05796668632735798, "grad_norm": 0.3671875, "grad_norm_var": 0.001828447977701823, "learning_rate": 0.0001, "loss": 1.6361, "loss/crossentropy": 2.5586124658584595, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.21808511018753052, "step": 3882 }, { "epoch": 0.05798161849797296, "grad_norm": 0.37890625, "grad_norm_var": 0.0017898400624593099, "learning_rate": 0.0001, "loss": 1.5282, "loss/crossentropy": 2.4479217529296875, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.18446940928697586, "step": 3883 }, { "epoch": 0.057996550668587936, "grad_norm": 0.337890625, "grad_norm_var": 0.0019454320271809896, "learning_rate": 0.0001, "loss": 1.423, "loss/crossentropy": 2.6216460466384888, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.16904866695404053, "step": 3884 }, { "epoch": 0.05801148283920292, "grad_norm": 0.400390625, "grad_norm_var": 0.0016499201456705729, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 2.8111345767974854, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.17237849533557892, "step": 3885 }, { "epoch": 0.0580264150098179, "grad_norm": 0.4453125, "grad_norm_var": 0.0018067518870035807, "learning_rate": 0.0001, "loss": 1.5029, "loss/crossentropy": 2.6142154932022095, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.20600423216819763, "step": 3886 }, { "epoch": 0.058041347180432884, "grad_norm": 0.388671875, "grad_norm_var": 0.0017397562662760417, "learning_rate": 0.0001, "loss": 1.5985, "loss/crossentropy": 2.5649173259735107, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.19615919142961502, "step": 3887 }, { "epoch": 0.05805627935104787, "grad_norm": 0.375, "grad_norm_var": 0.0016938368479410806, "learning_rate": 0.0001, "loss": 1.4729, "loss/crossentropy": 2.6804561614990234, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.19160328805446625, "step": 3888 }, { "epoch": 0.05807121152166285, "grad_norm": 0.33203125, "grad_norm_var": 0.0018246809641520182, "learning_rate": 0.0001, "loss": 1.4617, "loss/crossentropy": 2.5769530534744263, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.18044404685497284, "step": 3889 }, { "epoch": 0.058086143692277825, "grad_norm": 0.4140625, "grad_norm_var": 0.001800537109375, "learning_rate": 0.0001, "loss": 1.4998, "loss/crossentropy": 2.859683036804199, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1833595633506775, "step": 3890 }, { "epoch": 0.05810107586289281, "grad_norm": 0.396484375, "grad_norm_var": 0.000993967056274414, "learning_rate": 0.0001, "loss": 1.3385, "loss/crossentropy": 2.4655245542526245, "loss/fcd": 1.19140625, "loss/idx": 9.0, "loss/logits": 0.14705152064561844, "step": 3891 }, { "epoch": 0.05811600803350779, "grad_norm": 0.431640625, "grad_norm_var": 0.0011438369750976563, "learning_rate": 0.0001, "loss": 1.6182, "loss/crossentropy": 2.681907892227173, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.21198581904172897, "step": 3892 }, { "epoch": 0.05813094020412277, "grad_norm": 0.375, "grad_norm_var": 0.001137542724609375, "learning_rate": 0.0001, "loss": 1.436, "loss/crossentropy": 2.663278102874756, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.16648539900779724, "step": 3893 }, { "epoch": 0.058145872374737756, "grad_norm": 0.345703125, "grad_norm_var": 0.0011670271555582683, "learning_rate": 0.0001, "loss": 1.4933, "loss/crossentropy": 2.6788251399993896, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.18469743430614471, "step": 3894 }, { "epoch": 0.05816080454535274, "grad_norm": 0.3984375, "grad_norm_var": 0.0011300245920817058, "learning_rate": 0.0001, "loss": 1.5949, "loss/crossentropy": 2.3379160165786743, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.18869974464178085, "step": 3895 }, { "epoch": 0.058175736715967714, "grad_norm": 0.392578125, "grad_norm_var": 0.001112222671508789, "learning_rate": 0.0001, "loss": 1.5308, "loss/crossentropy": 2.5711464881896973, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.17928268760442734, "step": 3896 }, { "epoch": 0.0581906688865827, "grad_norm": 0.353515625, "grad_norm_var": 0.001023101806640625, "learning_rate": 0.0001, "loss": 1.4433, "loss/crossentropy": 2.643117904663086, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.18159063160419464, "step": 3897 }, { "epoch": 0.05820560105719768, "grad_norm": 0.330078125, "grad_norm_var": 0.0011888980865478516, "learning_rate": 0.0001, "loss": 1.4364, "loss/crossentropy": 2.496930718421936, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17082472145557404, "step": 3898 }, { "epoch": 0.05822053322781266, "grad_norm": 0.333984375, "grad_norm_var": 0.0013274510701497396, "learning_rate": 0.0001, "loss": 1.4946, "loss/crossentropy": 2.667851686477661, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.1899471953511238, "step": 3899 }, { "epoch": 0.058235465398427645, "grad_norm": 0.35546875, "grad_norm_var": 0.0012523492177327475, "learning_rate": 0.0001, "loss": 1.3388, "loss/crossentropy": 2.5564874410629272, "loss/fcd": 1.20703125, "loss/idx": 9.0, "loss/logits": 0.13181119039654732, "step": 3900 }, { "epoch": 0.05825039756904262, "grad_norm": 0.365234375, "grad_norm_var": 0.001230605443318685, "learning_rate": 0.0001, "loss": 1.5181, "loss/crossentropy": 2.6260217428207397, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19384204596281052, "step": 3901 }, { "epoch": 0.058265329739657604, "grad_norm": 0.392578125, "grad_norm_var": 0.0009246190388997396, "learning_rate": 0.0001, "loss": 1.4786, "loss/crossentropy": 2.4464505910873413, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18170516192913055, "step": 3902 }, { "epoch": 0.058280261910272586, "grad_norm": 0.384765625, "grad_norm_var": 0.000917816162109375, "learning_rate": 0.0001, "loss": 1.5651, "loss/crossentropy": 2.545528292655945, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.19014324247837067, "step": 3903 }, { "epoch": 0.05829519408088757, "grad_norm": 0.416015625, "grad_norm_var": 0.0010309696197509765, "learning_rate": 0.0001, "loss": 1.7109, "loss/crossentropy": 2.6628841161727905, "loss/fcd": 1.484375, "loss/idx": 9.0, "loss/logits": 0.22656091302633286, "step": 3904 }, { "epoch": 0.05831012625150255, "grad_norm": 0.359375, "grad_norm_var": 0.0009170373280843099, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.5724029541015625, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.1725471466779709, "step": 3905 }, { "epoch": 0.058325058422117534, "grad_norm": 0.4140625, "grad_norm_var": 0.0009170373280843099, "learning_rate": 0.0001, "loss": 1.5133, "loss/crossentropy": 2.5526294708251953, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.1891271471977234, "step": 3906 }, { "epoch": 0.05833999059273251, "grad_norm": 0.546875, "grad_norm_var": 0.0027051289876302083, "learning_rate": 0.0001, "loss": 1.7227, "loss/crossentropy": 2.460437536239624, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.26956476271152496, "step": 3907 }, { "epoch": 0.05835492276334749, "grad_norm": 0.443359375, "grad_norm_var": 0.0027831395467122397, "learning_rate": 0.0001, "loss": 1.6202, "loss/crossentropy": 2.369199514389038, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.18273650109767914, "step": 3908 }, { "epoch": 0.058369854933962476, "grad_norm": 0.361328125, "grad_norm_var": 0.0028184096018473307, "learning_rate": 0.0001, "loss": 1.534, "loss/crossentropy": 2.580521583557129, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.19807377457618713, "step": 3909 }, { "epoch": 0.05838478710457746, "grad_norm": 0.427734375, "grad_norm_var": 0.0027863661448160808, "learning_rate": 0.0001, "loss": 1.6194, "loss/crossentropy": 2.3149741888046265, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.21701885014772415, "step": 3910 }, { "epoch": 0.05839971927519244, "grad_norm": 0.38671875, "grad_norm_var": 0.0027852217356363934, "learning_rate": 0.0001, "loss": 1.7613, "loss/crossentropy": 2.5987417697906494, "loss/fcd": 1.4765625, "loss/idx": 9.0, "loss/logits": 0.28470752388238907, "step": 3911 }, { "epoch": 0.05841465144580742, "grad_norm": 0.34765625, "grad_norm_var": 0.0029047648111979168, "learning_rate": 0.0001, "loss": 1.4367, "loss/crossentropy": 2.4557379484176636, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.17494837939739227, "step": 3912 }, { "epoch": 0.0584295836164224, "grad_norm": 0.41796875, "grad_norm_var": 0.002862278620402018, "learning_rate": 0.0001, "loss": 1.5296, "loss/crossentropy": 2.7287439107894897, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.19367718696594238, "step": 3913 }, { "epoch": 0.05844451578703738, "grad_norm": 0.455078125, "grad_norm_var": 0.002795139948527018, "learning_rate": 0.0001, "loss": 1.7023, "loss/crossentropy": 2.77758526802063, "loss/fcd": 1.4765625, "loss/idx": 9.0, "loss/logits": 0.22576259076595306, "step": 3914 }, { "epoch": 0.058459447957652365, "grad_norm": 0.376953125, "grad_norm_var": 0.0025293827056884766, "learning_rate": 0.0001, "loss": 1.5554, "loss/crossentropy": 2.6605623960494995, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.19213218986988068, "step": 3915 }, { "epoch": 0.05847438012826735, "grad_norm": 0.3671875, "grad_norm_var": 0.0024633884429931642, "learning_rate": 0.0001, "loss": 1.4779, "loss/crossentropy": 2.488508105278015, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.17714009433984756, "step": 3916 }, { "epoch": 0.05848931229888233, "grad_norm": 0.4140625, "grad_norm_var": 0.0023604710896809895, "learning_rate": 0.0001, "loss": 1.6287, "loss/crossentropy": 2.7332743406295776, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.18732882291078568, "step": 3917 }, { "epoch": 0.058504244469497306, "grad_norm": 0.36328125, "grad_norm_var": 0.0024703820546468098, "learning_rate": 0.0001, "loss": 1.5542, "loss/crossentropy": 2.8051319122314453, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.18700101226568222, "step": 3918 }, { "epoch": 0.05851917664011229, "grad_norm": 0.384765625, "grad_norm_var": 0.0024703820546468098, "learning_rate": 0.0001, "loss": 1.3984, "loss/crossentropy": 2.3885180950164795, "loss/fcd": 1.23828125, "loss/idx": 9.0, "loss/logits": 0.16010092943906784, "step": 3919 }, { "epoch": 0.05853410881072727, "grad_norm": 0.3828125, "grad_norm_var": 0.002491188049316406, "learning_rate": 0.0001, "loss": 1.511, "loss/crossentropy": 2.63584566116333, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.1828843578696251, "step": 3920 }, { "epoch": 0.058549040981342254, "grad_norm": 0.380859375, "grad_norm_var": 0.002394851048787435, "learning_rate": 0.0001, "loss": 1.4208, "loss/crossentropy": 2.4344972372055054, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.1746799200773239, "step": 3921 }, { "epoch": 0.05856397315195724, "grad_norm": 0.376953125, "grad_norm_var": 0.0024332046508789063, "learning_rate": 0.0001, "loss": 1.6866, "loss/crossentropy": 2.742316961288452, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.23733707517385483, "step": 3922 }, { "epoch": 0.05857890532257221, "grad_norm": 0.349609375, "grad_norm_var": 0.0010574181874593099, "learning_rate": 0.0001, "loss": 1.4116, "loss/crossentropy": 2.7323845624923706, "loss/fcd": 1.23828125, "loss/idx": 9.0, "loss/logits": 0.17335227131843567, "step": 3923 }, { "epoch": 0.058593837493187195, "grad_norm": 0.34765625, "grad_norm_var": 0.000946044921875, "learning_rate": 0.0001, "loss": 1.4452, "loss/crossentropy": 2.5595463514328003, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17956571280956268, "step": 3924 }, { "epoch": 0.05860876966380218, "grad_norm": 0.365234375, "grad_norm_var": 0.0009353001912434896, "learning_rate": 0.0001, "loss": 1.5709, "loss/crossentropy": 2.574304699897766, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20374073088169098, "step": 3925 }, { "epoch": 0.05862370183441716, "grad_norm": 0.453125, "grad_norm_var": 0.0011235396067301431, "learning_rate": 0.0001, "loss": 1.7509, "loss/crossentropy": 2.6308183670043945, "loss/fcd": 1.515625, "loss/idx": 9.0, "loss/logits": 0.2352467179298401, "step": 3926 }, { "epoch": 0.05863863400503214, "grad_norm": 0.412109375, "grad_norm_var": 0.0011675516764322917, "learning_rate": 0.0001, "loss": 1.4791, "loss/crossentropy": 2.593815803527832, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.1743936389684677, "step": 3927 }, { "epoch": 0.058653566175647126, "grad_norm": 0.423828125, "grad_norm_var": 0.0011284987131754557, "learning_rate": 0.0001, "loss": 1.6444, "loss/crossentropy": 2.6533411741256714, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.20295210927724838, "step": 3928 }, { "epoch": 0.0586684983462621, "grad_norm": 0.36328125, "grad_norm_var": 0.0011258284250895182, "learning_rate": 0.0001, "loss": 1.5, "loss/crossentropy": 2.814623713493347, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.18745630979537964, "step": 3929 }, { "epoch": 0.058683430516877085, "grad_norm": 0.36328125, "grad_norm_var": 0.0008382161458333333, "learning_rate": 0.0001, "loss": 1.5248, "loss/crossentropy": 2.6334148645401, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.20446686446666718, "step": 3930 }, { "epoch": 0.05869836268749207, "grad_norm": 0.40234375, "grad_norm_var": 0.0008586724599202474, "learning_rate": 0.0001, "loss": 1.4606, "loss/crossentropy": 2.462403893470764, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.1754559427499771, "step": 3931 }, { "epoch": 0.05871329485810705, "grad_norm": 0.44140625, "grad_norm_var": 0.0010326226552327474, "learning_rate": 0.0001, "loss": 1.9381, "loss/crossentropy": 2.8730289936065674, "loss/fcd": 1.6953125, "loss/idx": 9.0, "loss/logits": 0.24278737604618073, "step": 3932 }, { "epoch": 0.05872822702872203, "grad_norm": 0.59375, "grad_norm_var": 0.0036501407623291014, "learning_rate": 0.0001, "loss": 1.9008, "loss/crossentropy": 2.4757241010665894, "loss/fcd": 1.625, "loss/idx": 9.0, "loss/logits": 0.2758151441812515, "step": 3933 }, { "epoch": 0.05874315919933701, "grad_norm": 0.37890625, "grad_norm_var": 0.0035883426666259766, "learning_rate": 0.0001, "loss": 1.6638, "loss/crossentropy": 2.4826565980911255, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.23025336861610413, "step": 3934 }, { "epoch": 0.05875809136995199, "grad_norm": 0.408203125, "grad_norm_var": 0.003571176528930664, "learning_rate": 0.0001, "loss": 1.5458, "loss/crossentropy": 2.56171977519989, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.18253694474697113, "step": 3935 }, { "epoch": 0.058773023540566974, "grad_norm": 0.375, "grad_norm_var": 0.0035957177480061848, "learning_rate": 0.0001, "loss": 1.4567, "loss/crossentropy": 2.5764979124069214, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.17544958740472794, "step": 3936 }, { "epoch": 0.058787955711181956, "grad_norm": 0.400390625, "grad_norm_var": 0.0035639286041259767, "learning_rate": 0.0001, "loss": 1.5439, "loss/crossentropy": 2.3860684633255005, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.17672371119260788, "step": 3937 }, { "epoch": 0.05880288788179694, "grad_norm": 0.48828125, "grad_norm_var": 0.003945350646972656, "learning_rate": 0.0001, "loss": 1.8548, "loss/crossentropy": 2.790028691291809, "loss/fcd": 1.5859375, "loss/idx": 9.0, "loss/logits": 0.26885779201984406, "step": 3938 }, { "epoch": 0.05881782005241192, "grad_norm": 0.412109375, "grad_norm_var": 0.0036828994750976564, "learning_rate": 0.0001, "loss": 1.5019, "loss/crossentropy": 2.429647922515869, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.17375494539737701, "step": 3939 }, { "epoch": 0.0588327522230269, "grad_norm": 0.421875, "grad_norm_var": 0.00336761474609375, "learning_rate": 0.0001, "loss": 1.4102, "loss/crossentropy": 2.54573917388916, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.16415618360042572, "step": 3940 }, { "epoch": 0.05884768439364188, "grad_norm": 0.44140625, "grad_norm_var": 0.0031847476959228514, "learning_rate": 0.0001, "loss": 1.8331, "loss/crossentropy": 2.50403368473053, "loss/fcd": 1.56640625, "loss/idx": 9.0, "loss/logits": 0.26670699566602707, "step": 3941 }, { "epoch": 0.05886261656425686, "grad_norm": 0.33203125, "grad_norm_var": 0.0036262353261311848, "learning_rate": 0.0001, "loss": 1.4088, "loss/crossentropy": 2.583983898162842, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.15883462131023407, "step": 3942 }, { "epoch": 0.058877548734871846, "grad_norm": 0.478515625, "grad_norm_var": 0.003866179784138997, "learning_rate": 0.0001, "loss": 1.5242, "loss/crossentropy": 2.5233267545700073, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.18824837356805801, "step": 3943 }, { "epoch": 0.05889248090548683, "grad_norm": 0.412109375, "grad_norm_var": 0.0038692315419514975, "learning_rate": 0.0001, "loss": 1.6344, "loss/crossentropy": 2.616483211517334, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.21640942245721817, "step": 3944 }, { "epoch": 0.058907413076101804, "grad_norm": 0.3984375, "grad_norm_var": 0.003682692845662435, "learning_rate": 0.0001, "loss": 1.5003, "loss/crossentropy": 2.5408653020858765, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.18386495113372803, "step": 3945 }, { "epoch": 0.05892234524671679, "grad_norm": 0.4296875, "grad_norm_var": 0.0034405867258707683, "learning_rate": 0.0001, "loss": 1.6849, "loss/crossentropy": 2.4914915561676025, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.23564668744802475, "step": 3946 }, { "epoch": 0.05893727741733177, "grad_norm": 0.400390625, "grad_norm_var": 0.00344696044921875, "learning_rate": 0.0001, "loss": 1.6663, "loss/crossentropy": 2.646287679672241, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.23269697278738022, "step": 3947 }, { "epoch": 0.05895220958794675, "grad_norm": 0.361328125, "grad_norm_var": 0.0036809126536051433, "learning_rate": 0.0001, "loss": 1.5261, "loss/crossentropy": 2.6402257680892944, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.2057717815041542, "step": 3948 }, { "epoch": 0.058967141758561735, "grad_norm": 0.62890625, "grad_norm_var": 0.0045689741770426435, "learning_rate": 0.0001, "loss": 1.6316, "loss/crossentropy": 2.604387402534485, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.2214445248246193, "step": 3949 }, { "epoch": 0.05898207392917672, "grad_norm": 0.3984375, "grad_norm_var": 0.004478057225545247, "learning_rate": 0.0001, "loss": 1.441, "loss/crossentropy": 2.4845879077911377, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.163669154047966, "step": 3950 }, { "epoch": 0.05899700609979169, "grad_norm": 0.416015625, "grad_norm_var": 0.004465214411417643, "learning_rate": 0.0001, "loss": 1.5282, "loss/crossentropy": 2.6957576274871826, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.18055449426174164, "step": 3951 }, { "epoch": 0.059011938270406676, "grad_norm": 0.400390625, "grad_norm_var": 0.004337310791015625, "learning_rate": 0.0001, "loss": 1.5331, "loss/crossentropy": 2.5198936462402344, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.20102061331272125, "step": 3952 }, { "epoch": 0.05902687044102166, "grad_norm": 0.3828125, "grad_norm_var": 0.004417276382446289, "learning_rate": 0.0001, "loss": 1.6486, "loss/crossentropy": 2.607659935951233, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.23452119529247284, "step": 3953 }, { "epoch": 0.05904180261163664, "grad_norm": 0.388671875, "grad_norm_var": 0.00419921875, "learning_rate": 0.0001, "loss": 1.496, "loss/crossentropy": 2.4870898723602295, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.18351981043815613, "step": 3954 }, { "epoch": 0.059056734782251624, "grad_norm": 0.34375, "grad_norm_var": 0.004553588231404623, "learning_rate": 0.0001, "loss": 1.5015, "loss/crossentropy": 2.7227959632873535, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.2007230594754219, "step": 3955 }, { "epoch": 0.05907166695286661, "grad_norm": 0.373046875, "grad_norm_var": 0.004655710856119792, "learning_rate": 0.0001, "loss": 1.3662, "loss/crossentropy": 2.4741904735565186, "loss/fcd": 1.2109375, "loss/idx": 9.0, "loss/logits": 0.1552594229578972, "step": 3956 }, { "epoch": 0.05908659912348158, "grad_norm": 0.3671875, "grad_norm_var": 0.004705238342285156, "learning_rate": 0.0001, "loss": 1.5425, "loss/crossentropy": 2.7029366493225098, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.2066010683774948, "step": 3957 }, { "epoch": 0.059101531294096565, "grad_norm": 0.427734375, "grad_norm_var": 0.00432127316792806, "learning_rate": 0.0001, "loss": 1.7971, "loss/crossentropy": 2.405835270881653, "loss/fcd": 1.53125, "loss/idx": 9.0, "loss/logits": 0.2658504918217659, "step": 3958 }, { "epoch": 0.05911646346471155, "grad_norm": 0.359375, "grad_norm_var": 0.00416711171468099, "learning_rate": 0.0001, "loss": 1.6576, "loss/crossentropy": 2.5292794704437256, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.22402888536453247, "step": 3959 }, { "epoch": 0.05913139563532653, "grad_norm": 0.404296875, "grad_norm_var": 0.004164059956868489, "learning_rate": 0.0001, "loss": 1.4961, "loss/crossentropy": 2.4386956691741943, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1796865463256836, "step": 3960 }, { "epoch": 0.05914632780594151, "grad_norm": 0.404296875, "grad_norm_var": 0.00416105588277181, "learning_rate": 0.0001, "loss": 1.4794, "loss/crossentropy": 2.57099187374115, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18247656524181366, "step": 3961 }, { "epoch": 0.05916125997655649, "grad_norm": 0.4296875, "grad_norm_var": 0.00416105588277181, "learning_rate": 0.0001, "loss": 1.6238, "loss/crossentropy": 2.552027702331543, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.22536169737577438, "step": 3962 }, { "epoch": 0.05917619214717147, "grad_norm": 0.341796875, "grad_norm_var": 0.0044147332509358725, "learning_rate": 0.0001, "loss": 1.5683, "loss/crossentropy": 2.499171495437622, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.2089046984910965, "step": 3963 }, { "epoch": 0.059191124317786455, "grad_norm": 0.498046875, "grad_norm_var": 0.004846429824829102, "learning_rate": 0.0001, "loss": 1.722, "loss/crossentropy": 2.6624897718429565, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.2571970224380493, "step": 3964 }, { "epoch": 0.05920605648840144, "grad_norm": 0.33984375, "grad_norm_var": 0.001642465591430664, "learning_rate": 0.0001, "loss": 1.5589, "loss/crossentropy": 2.5300744771957397, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.21908967196941376, "step": 3965 }, { "epoch": 0.05922098865901642, "grad_norm": 0.359375, "grad_norm_var": 0.0017054080963134766, "learning_rate": 0.0001, "loss": 1.569, "loss/crossentropy": 2.437151312828064, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20181192457675934, "step": 3966 }, { "epoch": 0.0592359208296314, "grad_norm": 0.359375, "grad_norm_var": 0.0017077128092447917, "learning_rate": 0.0001, "loss": 1.5056, "loss/crossentropy": 2.575844168663025, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1892310306429863, "step": 3967 }, { "epoch": 0.05925085300024638, "grad_norm": 0.365234375, "grad_norm_var": 0.0017185846964518229, "learning_rate": 0.0001, "loss": 1.5067, "loss/crossentropy": 2.542594075202942, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.1864328533411026, "step": 3968 }, { "epoch": 0.05926578517086136, "grad_norm": 0.359375, "grad_norm_var": 0.0017567316691080729, "learning_rate": 0.0001, "loss": 1.4119, "loss/crossentropy": 2.697471022605896, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.17748141288757324, "step": 3969 }, { "epoch": 0.059280717341476344, "grad_norm": 0.36328125, "grad_norm_var": 0.0017763614654541016, "learning_rate": 0.0001, "loss": 1.4795, "loss/crossentropy": 2.745514988899231, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18657537549734116, "step": 3970 }, { "epoch": 0.05929564951209133, "grad_norm": 0.36328125, "grad_norm_var": 0.0017032464345296224, "learning_rate": 0.0001, "loss": 1.5106, "loss/crossentropy": 2.772555112838745, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.19030853360891342, "step": 3971 }, { "epoch": 0.05931058168270631, "grad_norm": 0.365234375, "grad_norm_var": 0.00171659787495931, "learning_rate": 0.0001, "loss": 1.48, "loss/crossentropy": 2.322414994239807, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.17529278248548508, "step": 3972 }, { "epoch": 0.059325513853321285, "grad_norm": 0.37109375, "grad_norm_var": 0.0017099857330322265, "learning_rate": 0.0001, "loss": 1.519, "loss/crossentropy": 2.511973023414612, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.1869632452726364, "step": 3973 }, { "epoch": 0.05934044602393627, "grad_norm": 0.40234375, "grad_norm_var": 0.001595306396484375, "learning_rate": 0.0001, "loss": 1.5541, "loss/crossentropy": 2.4941686391830444, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.18689095228910446, "step": 3974 }, { "epoch": 0.05935537819455125, "grad_norm": 0.640625, "grad_norm_var": 0.005751800537109375, "learning_rate": 0.0001, "loss": 1.6501, "loss/crossentropy": 2.5279555320739746, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.21650373190641403, "step": 3975 }, { "epoch": 0.05937031036516623, "grad_norm": 0.34375, "grad_norm_var": 0.00592967669169108, "learning_rate": 0.0001, "loss": 1.4151, "loss/crossentropy": 2.4987666606903076, "loss/fcd": 1.23828125, "loss/idx": 9.0, "loss/logits": 0.1768038496375084, "step": 3976 }, { "epoch": 0.059385242535781216, "grad_norm": 0.4296875, "grad_norm_var": 0.00600426991780599, "learning_rate": 0.0001, "loss": 1.5728, "loss/crossentropy": 2.4692280292510986, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.2095077633857727, "step": 3977 }, { "epoch": 0.0594001747063962, "grad_norm": 0.40234375, "grad_norm_var": 0.005927276611328125, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.5521785020828247, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.186623215675354, "step": 3978 }, { "epoch": 0.059415106877011174, "grad_norm": 0.46875, "grad_norm_var": 0.006050221125284831, "learning_rate": 0.0001, "loss": 1.619, "loss/crossentropy": 2.7475545406341553, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.21270616352558136, "step": 3979 }, { "epoch": 0.05943003904762616, "grad_norm": 0.435546875, "grad_norm_var": 0.005493783950805664, "learning_rate": 0.0001, "loss": 1.477, "loss/crossentropy": 2.8654590845108032, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.1918497532606125, "step": 3980 }, { "epoch": 0.05944497121824114, "grad_norm": 0.373046875, "grad_norm_var": 0.005304908752441407, "learning_rate": 0.0001, "loss": 1.4274, "loss/crossentropy": 2.667895197868347, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.16963639855384827, "step": 3981 }, { "epoch": 0.05945990338885612, "grad_norm": 0.396484375, "grad_norm_var": 0.005189243952433268, "learning_rate": 0.0001, "loss": 1.6111, "loss/crossentropy": 2.544269323348999, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.20097996294498444, "step": 3982 }, { "epoch": 0.059474835559471105, "grad_norm": 0.34375, "grad_norm_var": 0.005294275283813476, "learning_rate": 0.0001, "loss": 1.4844, "loss/crossentropy": 2.6529990434646606, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.17970096319913864, "step": 3983 }, { "epoch": 0.05948976773008608, "grad_norm": 0.341796875, "grad_norm_var": 0.005441904067993164, "learning_rate": 0.0001, "loss": 1.5091, "loss/crossentropy": 2.687416434288025, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.19274116307497025, "step": 3984 }, { "epoch": 0.059504699900701064, "grad_norm": 0.447265625, "grad_norm_var": 0.005448341369628906, "learning_rate": 0.0001, "loss": 1.674, "loss/crossentropy": 2.4107563495635986, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.209183469414711, "step": 3985 }, { "epoch": 0.059519632071316046, "grad_norm": 0.421875, "grad_norm_var": 0.00533294677734375, "learning_rate": 0.0001, "loss": 1.5575, "loss/crossentropy": 2.9239548444747925, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.20986974239349365, "step": 3986 }, { "epoch": 0.05953456424193103, "grad_norm": 0.33203125, "grad_norm_var": 0.005585225423177084, "learning_rate": 0.0001, "loss": 1.4349, "loss/crossentropy": 2.264728546142578, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.16537919640541077, "step": 3987 }, { "epoch": 0.05954949641254601, "grad_norm": 0.359375, "grad_norm_var": 0.0056201775868733725, "learning_rate": 0.0001, "loss": 1.4399, "loss/crossentropy": 2.6259623765945435, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.18213149905204773, "step": 3988 }, { "epoch": 0.059564428583160994, "grad_norm": 0.37109375, "grad_norm_var": 0.0056201775868733725, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.581473469734192, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18118131160736084, "step": 3989 }, { "epoch": 0.05957936075377597, "grad_norm": 0.396484375, "grad_norm_var": 0.0056258519490559895, "learning_rate": 0.0001, "loss": 1.5499, "loss/crossentropy": 2.793413996696472, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.21005398780107498, "step": 3990 }, { "epoch": 0.05959429292439095, "grad_norm": 0.333984375, "grad_norm_var": 0.001930093765258789, "learning_rate": 0.0001, "loss": 1.3931, "loss/crossentropy": 2.540606141090393, "loss/fcd": 1.23046875, "loss/idx": 9.0, "loss/logits": 0.1626478135585785, "step": 3991 }, { "epoch": 0.059609225095005935, "grad_norm": 0.3984375, "grad_norm_var": 0.0017992496490478516, "learning_rate": 0.0001, "loss": 1.5399, "loss/crossentropy": 2.561323404312134, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.1922682821750641, "step": 3992 }, { "epoch": 0.05962415726562092, "grad_norm": 0.41015625, "grad_norm_var": 0.0017216841379801432, "learning_rate": 0.0001, "loss": 1.5986, "loss/crossentropy": 2.595049738883972, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.20411132276058197, "step": 3993 }, { "epoch": 0.0596390894362359, "grad_norm": 0.373046875, "grad_norm_var": 0.0017252604166666666, "learning_rate": 0.0001, "loss": 1.6449, "loss/crossentropy": 2.2532848119735718, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.2074195221066475, "step": 3994 }, { "epoch": 0.05965402160685088, "grad_norm": 0.40234375, "grad_norm_var": 0.0012832005818684896, "learning_rate": 0.0001, "loss": 1.5382, "loss/crossentropy": 2.6961759328842163, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.1983640044927597, "step": 3995 }, { "epoch": 0.05966895377746586, "grad_norm": 0.33984375, "grad_norm_var": 0.0011920770009358724, "learning_rate": 0.0001, "loss": 1.5115, "loss/crossentropy": 2.62531578540802, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19895964115858078, "step": 3996 }, { "epoch": 0.05968388594808084, "grad_norm": 0.3671875, "grad_norm_var": 0.0011977513631184896, "learning_rate": 0.0001, "loss": 1.557, "loss/crossentropy": 2.661519408226013, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19762397557497025, "step": 3997 }, { "epoch": 0.059698818118695825, "grad_norm": 0.361328125, "grad_norm_var": 0.0011845906575520834, "learning_rate": 0.0001, "loss": 1.5291, "loss/crossentropy": 2.784752607345581, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.20096470415592194, "step": 3998 }, { "epoch": 0.05971375028931081, "grad_norm": 0.7578125, "grad_norm_var": 0.010174814860026042, "learning_rate": 0.0001, "loss": 1.7339, "loss/crossentropy": 2.557211995124817, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.25338931381702423, "step": 3999 }, { "epoch": 0.05972868245992579, "grad_norm": 0.33984375, "grad_norm_var": 0.010190439224243165, "learning_rate": 0.0001, "loss": 1.5343, "loss/crossentropy": 2.5442386865615845, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.19833143055438995, "step": 4000 }, { "epoch": 0.059743614630540766, "grad_norm": 0.408203125, "grad_norm_var": 0.010043573379516602, "learning_rate": 0.0001, "loss": 1.5024, "loss/crossentropy": 2.7850793600082397, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.1743115559220314, "step": 4001 }, { "epoch": 0.05975854680115575, "grad_norm": 0.412109375, "grad_norm_var": 0.010018857320149739, "learning_rate": 0.0001, "loss": 1.6412, "loss/crossentropy": 2.799274206161499, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.21151474118232727, "step": 4002 }, { "epoch": 0.05977347897177073, "grad_norm": 0.40625, "grad_norm_var": 0.009713236490885417, "learning_rate": 0.0001, "loss": 1.4511, "loss/crossentropy": 2.588519811630249, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.17379430681467056, "step": 4003 }, { "epoch": 0.059788411142385714, "grad_norm": 0.95703125, "grad_norm_var": 0.028613726298014324, "learning_rate": 0.0001, "loss": 1.6077, "loss/crossentropy": 2.5910991430282593, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.21312463283538818, "step": 4004 }, { "epoch": 0.0598033433130007, "grad_norm": 0.423828125, "grad_norm_var": 0.02830516497294108, "learning_rate": 0.0001, "loss": 1.6537, "loss/crossentropy": 2.512299060821533, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.22792838513851166, "step": 4005 }, { "epoch": 0.05981827548361567, "grad_norm": 0.396484375, "grad_norm_var": 0.02830516497294108, "learning_rate": 0.0001, "loss": 1.4638, "loss/crossentropy": 2.7195088863372803, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.17859376221895218, "step": 4006 }, { "epoch": 0.059833207654230655, "grad_norm": 0.388671875, "grad_norm_var": 0.027697229385375978, "learning_rate": 0.0001, "loss": 1.5165, "loss/crossentropy": 2.6595805883407593, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.19618721306324005, "step": 4007 }, { "epoch": 0.05984813982484564, "grad_norm": 0.3671875, "grad_norm_var": 0.027958154678344727, "learning_rate": 0.0001, "loss": 1.6152, "loss/crossentropy": 2.5878041982650757, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.19718912988901138, "step": 4008 }, { "epoch": 0.05986307199546062, "grad_norm": 0.40234375, "grad_norm_var": 0.027997700373331706, "learning_rate": 0.0001, "loss": 1.558, "loss/crossentropy": 2.61835515499115, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.19473712146282196, "step": 4009 }, { "epoch": 0.0598780041660756, "grad_norm": 0.384765625, "grad_norm_var": 0.027895466486612955, "learning_rate": 0.0001, "loss": 1.5637, "loss/crossentropy": 2.5350440740585327, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20429791510105133, "step": 4010 }, { "epoch": 0.059892936336690586, "grad_norm": 0.376953125, "grad_norm_var": 0.02807916005452474, "learning_rate": 0.0001, "loss": 1.682, "loss/crossentropy": 2.4871041774749756, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.24840883910655975, "step": 4011 }, { "epoch": 0.05990786850730556, "grad_norm": 0.359375, "grad_norm_var": 0.027834065755208335, "learning_rate": 0.0001, "loss": 1.63, "loss/crossentropy": 2.4894098043441772, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.2159746289253235, "step": 4012 }, { "epoch": 0.059922800677920544, "grad_norm": 0.416015625, "grad_norm_var": 0.02748080889383952, "learning_rate": 0.0001, "loss": 1.6493, "loss/crossentropy": 2.532664179801941, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.2157217264175415, "step": 4013 }, { "epoch": 0.05993773284853553, "grad_norm": 0.421875, "grad_norm_var": 0.027015177408854167, "learning_rate": 0.0001, "loss": 1.57, "loss/crossentropy": 2.4368048906326294, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.1715148687362671, "step": 4014 }, { "epoch": 0.05995266501915051, "grad_norm": 0.373046875, "grad_norm_var": 0.02053666114807129, "learning_rate": 0.0001, "loss": 1.4987, "loss/crossentropy": 2.572019577026367, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.19012603163719177, "step": 4015 }, { "epoch": 0.05996759718976549, "grad_norm": 0.373046875, "grad_norm_var": 0.020219167073567707, "learning_rate": 0.0001, "loss": 1.4627, "loss/crossentropy": 2.7683348655700684, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18532554805278778, "step": 4016 }, { "epoch": 0.059982529360380475, "grad_norm": 0.380859375, "grad_norm_var": 0.020342445373535155, "learning_rate": 0.0001, "loss": 1.5614, "loss/crossentropy": 2.915337562561035, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20207060873508453, "step": 4017 }, { "epoch": 0.05999746153099545, "grad_norm": 0.451171875, "grad_norm_var": 0.020357704162597655, "learning_rate": 0.0001, "loss": 1.9249, "loss/crossentropy": 2.616655468940735, "loss/fcd": 1.66796875, "loss/idx": 9.0, "loss/logits": 0.25690100342035294, "step": 4018 }, { "epoch": 0.060012393701610434, "grad_norm": 0.353515625, "grad_norm_var": 0.020698022842407227, "learning_rate": 0.0001, "loss": 1.7613, "loss/crossentropy": 2.4389408826828003, "loss/fcd": 1.5, "loss/idx": 9.0, "loss/logits": 0.26134316623210907, "step": 4019 }, { "epoch": 0.060027325872225416, "grad_norm": 0.357421875, "grad_norm_var": 0.0007647196451822917, "learning_rate": 0.0001, "loss": 1.4394, "loss/crossentropy": 2.5540891885757446, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.16200730204582214, "step": 4020 }, { "epoch": 0.0600422580428404, "grad_norm": 0.443359375, "grad_norm_var": 0.0008788426717122396, "learning_rate": 0.0001, "loss": 1.4621, "loss/crossentropy": 2.3907217979431152, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.16135117411613464, "step": 4021 }, { "epoch": 0.06005719021345538, "grad_norm": 0.349609375, "grad_norm_var": 0.0009780248006184896, "learning_rate": 0.0001, "loss": 1.4283, "loss/crossentropy": 2.637732982635498, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.1783207729458809, "step": 4022 }, { "epoch": 0.06007212238407036, "grad_norm": 0.34375, "grad_norm_var": 0.0010968367258707681, "learning_rate": 0.0001, "loss": 1.4767, "loss/crossentropy": 2.6149617433547974, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.17984077334403992, "step": 4023 }, { "epoch": 0.06008705455468534, "grad_norm": 0.412109375, "grad_norm_var": 0.0011184056599934896, "learning_rate": 0.0001, "loss": 1.6633, "loss/crossentropy": 2.6854859590530396, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.210180401802063, "step": 4024 }, { "epoch": 0.06010198672530032, "grad_norm": 0.36328125, "grad_norm_var": 0.0011362075805664063, "learning_rate": 0.0001, "loss": 1.4844, "loss/crossentropy": 2.6919792890548706, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.19533725827932358, "step": 4025 }, { "epoch": 0.060116918895915306, "grad_norm": 0.359375, "grad_norm_var": 0.0011773268381754557, "learning_rate": 0.0001, "loss": 1.5419, "loss/crossentropy": 2.633792757987976, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19028809666633606, "step": 4026 }, { "epoch": 0.06013185106653029, "grad_norm": 0.41796875, "grad_norm_var": 0.001247088114420573, "learning_rate": 0.0001, "loss": 1.5525, "loss/crossentropy": 2.7568392753601074, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.1930968090891838, "step": 4027 }, { "epoch": 0.06014678323714527, "grad_norm": 0.3671875, "grad_norm_var": 0.0012231826782226562, "learning_rate": 0.0001, "loss": 1.5256, "loss/crossentropy": 2.463181257247925, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19359684735536575, "step": 4028 }, { "epoch": 0.06016171540776025, "grad_norm": 0.380859375, "grad_norm_var": 0.001161956787109375, "learning_rate": 0.0001, "loss": 1.4937, "loss/crossentropy": 2.5316061973571777, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18900883197784424, "step": 4029 }, { "epoch": 0.06017664757837523, "grad_norm": 0.330078125, "grad_norm_var": 0.0012284437815348306, "learning_rate": 0.0001, "loss": 1.5917, "loss/crossentropy": 2.540556788444519, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.22062642872333527, "step": 4030 }, { "epoch": 0.06019157974899021, "grad_norm": 0.365234375, "grad_norm_var": 0.0012379805246988931, "learning_rate": 0.0001, "loss": 1.4841, "loss/crossentropy": 2.705519914627075, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.1910841390490532, "step": 4031 }, { "epoch": 0.060206511919605195, "grad_norm": 0.470703125, "grad_norm_var": 0.0017688592274983724, "learning_rate": 0.0001, "loss": 1.889, "loss/crossentropy": 2.2127660512924194, "loss/fcd": 1.625, "loss/idx": 9.0, "loss/logits": 0.2640167102217674, "step": 4032 }, { "epoch": 0.06022144409022018, "grad_norm": 0.359375, "grad_norm_var": 0.0018071492513020834, "learning_rate": 0.0001, "loss": 1.5406, "loss/crossentropy": 2.6700481176376343, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.21251235902309418, "step": 4033 }, { "epoch": 0.06023637626083515, "grad_norm": 0.328125, "grad_norm_var": 0.001631911595662435, "learning_rate": 0.0001, "loss": 1.3869, "loss/crossentropy": 2.400204658508301, "loss/fcd": 1.23046875, "loss/idx": 9.0, "loss/logits": 0.1564297080039978, "step": 4034 }, { "epoch": 0.060251308431450136, "grad_norm": 0.396484375, "grad_norm_var": 0.00162351926167806, "learning_rate": 0.0001, "loss": 1.5101, "loss/crossentropy": 2.5231637954711914, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.185927614569664, "step": 4035 }, { "epoch": 0.06026624060206512, "grad_norm": 0.376953125, "grad_norm_var": 0.0015942732493082683, "learning_rate": 0.0001, "loss": 1.624, "loss/crossentropy": 2.7593353986740112, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.2216474413871765, "step": 4036 }, { "epoch": 0.0602811727726801, "grad_norm": 0.3671875, "grad_norm_var": 0.0013035456339518228, "learning_rate": 0.0001, "loss": 1.4692, "loss/crossentropy": 2.6521430015563965, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18401145190000534, "step": 4037 }, { "epoch": 0.060296104943295084, "grad_norm": 0.34375, "grad_norm_var": 0.0013249556223551432, "learning_rate": 0.0001, "loss": 1.584, "loss/crossentropy": 2.45623242855072, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.22072632610797882, "step": 4038 }, { "epoch": 0.06031103711391007, "grad_norm": 0.458984375, "grad_norm_var": 0.0016916275024414062, "learning_rate": 0.0001, "loss": 1.4079, "loss/crossentropy": 2.615217685699463, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.1618005931377411, "step": 4039 }, { "epoch": 0.06032596928452504, "grad_norm": 0.42578125, "grad_norm_var": 0.0017598311106363932, "learning_rate": 0.0001, "loss": 1.6434, "loss/crossentropy": 2.1958866119384766, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.19026019424200058, "step": 4040 }, { "epoch": 0.060340901455140025, "grad_norm": 0.46484375, "grad_norm_var": 0.002151600519816081, "learning_rate": 0.0001, "loss": 1.7881, "loss/crossentropy": 2.671264886856079, "loss/fcd": 1.53515625, "loss/idx": 9.0, "loss/logits": 0.25290368497371674, "step": 4041 }, { "epoch": 0.06035583362575501, "grad_norm": 0.37109375, "grad_norm_var": 0.0021149794260660808, "learning_rate": 0.0001, "loss": 1.4006, "loss/crossentropy": 2.7238343954086304, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.16625387221574783, "step": 4042 }, { "epoch": 0.06037076579636999, "grad_norm": 0.326171875, "grad_norm_var": 0.0022875467936197915, "learning_rate": 0.0001, "loss": 1.3927, "loss/crossentropy": 2.8006216287612915, "loss/fcd": 1.21484375, "loss/idx": 9.0, "loss/logits": 0.17784865200519562, "step": 4043 }, { "epoch": 0.06038569796698497, "grad_norm": 0.33203125, "grad_norm_var": 0.002440325419108073, "learning_rate": 0.0001, "loss": 1.4795, "loss/crossentropy": 2.574357032775879, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.19825482368469238, "step": 4044 }, { "epoch": 0.06040063013759995, "grad_norm": 0.404296875, "grad_norm_var": 0.002473894755045573, "learning_rate": 0.0001, "loss": 1.5817, "loss/crossentropy": 2.459779739379883, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.19502703845500946, "step": 4045 }, { "epoch": 0.06041556230821493, "grad_norm": 0.416015625, "grad_norm_var": 0.0023340225219726563, "learning_rate": 0.0001, "loss": 1.5941, "loss/crossentropy": 2.5986061096191406, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.20741916447877884, "step": 4046 }, { "epoch": 0.060430494478829914, "grad_norm": 0.333984375, "grad_norm_var": 0.002489662170410156, "learning_rate": 0.0001, "loss": 1.4243, "loss/crossentropy": 2.60748553276062, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.17426716536283493, "step": 4047 }, { "epoch": 0.0604454266494449, "grad_norm": 0.51953125, "grad_norm_var": 0.0031902154286702475, "learning_rate": 0.0001, "loss": 1.767, "loss/crossentropy": 2.3736441135406494, "loss/fcd": 1.55078125, "loss/idx": 9.0, "loss/logits": 0.2162594124674797, "step": 4048 }, { "epoch": 0.06046035882005988, "grad_norm": 0.431640625, "grad_norm_var": 0.003230794270833333, "learning_rate": 0.0001, "loss": 1.6175, "loss/crossentropy": 2.482070565223694, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.22686432301998138, "step": 4049 }, { "epoch": 0.06047529099067486, "grad_norm": 0.408203125, "grad_norm_var": 0.0029329776763916014, "learning_rate": 0.0001, "loss": 1.653, "loss/crossentropy": 2.253244638442993, "loss/fcd": 1.4453125, "loss/idx": 9.0, "loss/logits": 0.20764288306236267, "step": 4050 }, { "epoch": 0.06049022316128984, "grad_norm": 0.46484375, "grad_norm_var": 0.0032061258951822918, "learning_rate": 0.0001, "loss": 1.5301, "loss/crossentropy": 2.434836745262146, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.17856667935848236, "step": 4051 }, { "epoch": 0.06050515533190482, "grad_norm": 0.322265625, "grad_norm_var": 0.003581746419270833, "learning_rate": 0.0001, "loss": 1.4361, "loss/crossentropy": 2.545537829399109, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.17440716922283173, "step": 4052 }, { "epoch": 0.060520087502519804, "grad_norm": 0.388671875, "grad_norm_var": 0.0035182793935139975, "learning_rate": 0.0001, "loss": 1.6053, "loss/crossentropy": 2.8264684677124023, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.21859118342399597, "step": 4053 }, { "epoch": 0.060535019673134786, "grad_norm": 0.306640625, "grad_norm_var": 0.00388641357421875, "learning_rate": 0.0001, "loss": 1.5593, "loss/crossentropy": 2.488881826400757, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.21558251976966858, "step": 4054 }, { "epoch": 0.06054995184374977, "grad_norm": 0.36328125, "grad_norm_var": 0.003686253229777018, "learning_rate": 0.0001, "loss": 1.4677, "loss/crossentropy": 2.6237717866897583, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.17477117478847504, "step": 4055 }, { "epoch": 0.060564884014364745, "grad_norm": 0.451171875, "grad_norm_var": 0.0038393656412760418, "learning_rate": 0.0001, "loss": 1.7019, "loss/crossentropy": 2.49915087223053, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.2370549589395523, "step": 4056 }, { "epoch": 0.06057981618497973, "grad_norm": 0.400390625, "grad_norm_var": 0.003490559260050456, "learning_rate": 0.0001, "loss": 1.4393, "loss/crossentropy": 2.6837812662124634, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.18146122992038727, "step": 4057 }, { "epoch": 0.06059474835559471, "grad_norm": 0.3671875, "grad_norm_var": 0.0035013675689697264, "learning_rate": 0.0001, "loss": 1.4488, "loss/crossentropy": 2.4703017473220825, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17536019533872604, "step": 4058 }, { "epoch": 0.06060968052620969, "grad_norm": 0.392578125, "grad_norm_var": 0.003213866551717122, "learning_rate": 0.0001, "loss": 1.497, "loss/crossentropy": 2.5554224252700806, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.19235306233167648, "step": 4059 }, { "epoch": 0.060624612696824676, "grad_norm": 0.5859375, "grad_norm_var": 0.005147918065388998, "learning_rate": 0.0001, "loss": 2.6916, "loss/crossentropy": 2.594683051109314, "loss/fcd": 2.2890625, "loss/idx": 9.0, "loss/logits": 0.4025867134332657, "step": 4060 }, { "epoch": 0.06063954486743966, "grad_norm": 0.373046875, "grad_norm_var": 0.005231841405232748, "learning_rate": 0.0001, "loss": 1.5254, "loss/crossentropy": 2.5890448093414307, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19730211794376373, "step": 4061 }, { "epoch": 0.060654477038054634, "grad_norm": 0.376953125, "grad_norm_var": 0.005284611384073893, "learning_rate": 0.0001, "loss": 1.4199, "loss/crossentropy": 2.7080774307250977, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.17376524955034256, "step": 4062 }, { "epoch": 0.06066940920866962, "grad_norm": 0.4296875, "grad_norm_var": 0.004945818583170573, "learning_rate": 0.0001, "loss": 1.6876, "loss/crossentropy": 2.8351014852523804, "loss/fcd": 1.45703125, "loss/idx": 9.0, "loss/logits": 0.23057088255882263, "step": 4063 }, { "epoch": 0.0606843413792846, "grad_norm": 0.421875, "grad_norm_var": 0.00413360595703125, "learning_rate": 0.0001, "loss": 1.5144, "loss/crossentropy": 2.4663922786712646, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.2018662840127945, "step": 4064 }, { "epoch": 0.06069927354989958, "grad_norm": 0.4296875, "grad_norm_var": 0.004126977920532226, "learning_rate": 0.0001, "loss": 1.4568, "loss/crossentropy": 2.5285013914108276, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.18339277058839798, "step": 4065 }, { "epoch": 0.060714205720514565, "grad_norm": 0.388671875, "grad_norm_var": 0.004142872492472331, "learning_rate": 0.0001, "loss": 1.5466, "loss/crossentropy": 2.460532784461975, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.19506916403770447, "step": 4066 }, { "epoch": 0.06072913789112954, "grad_norm": 0.388671875, "grad_norm_var": 0.0038868586222330728, "learning_rate": 0.0001, "loss": 1.582, "loss/crossentropy": 2.7051910161972046, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.20702971518039703, "step": 4067 }, { "epoch": 0.06074407006174452, "grad_norm": 0.373046875, "grad_norm_var": 0.0035273234049479165, "learning_rate": 0.0001, "loss": 1.5304, "loss/crossentropy": 2.4663058519363403, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.19058635830879211, "step": 4068 }, { "epoch": 0.060759002232359506, "grad_norm": 0.37109375, "grad_norm_var": 0.0035786787668863934, "learning_rate": 0.0001, "loss": 1.5746, "loss/crossentropy": 2.568427324295044, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20742948353290558, "step": 4069 }, { "epoch": 0.06077393440297449, "grad_norm": 0.4375, "grad_norm_var": 0.0029982884724934896, "learning_rate": 0.0001, "loss": 1.6674, "loss/crossentropy": 2.7094037532806396, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.22985589504241943, "step": 4070 }, { "epoch": 0.06078886657358947, "grad_norm": 0.384765625, "grad_norm_var": 0.0028949578603108725, "learning_rate": 0.0001, "loss": 1.7039, "loss/crossentropy": 2.4266016483306885, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.2508121281862259, "step": 4071 }, { "epoch": 0.060803798744204454, "grad_norm": 0.5234375, "grad_norm_var": 0.003610674540201823, "learning_rate": 0.0001, "loss": 1.859, "loss/crossentropy": 2.5562597513198853, "loss/fcd": 1.6015625, "loss/idx": 9.0, "loss/logits": 0.25741027295589447, "step": 4072 }, { "epoch": 0.06081873091481943, "grad_norm": 0.373046875, "grad_norm_var": 0.003711700439453125, "learning_rate": 0.0001, "loss": 1.4989, "loss/crossentropy": 2.5632734298706055, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.18248603492975235, "step": 4073 }, { "epoch": 0.06083366308543441, "grad_norm": 0.435546875, "grad_norm_var": 0.003580967585245768, "learning_rate": 0.0001, "loss": 1.6984, "loss/crossentropy": 3.028182625770569, "loss/fcd": 1.46875, "loss/idx": 9.0, "loss/logits": 0.22967668622732162, "step": 4074 }, { "epoch": 0.060848595256049395, "grad_norm": 0.349609375, "grad_norm_var": 0.0038411299387613933, "learning_rate": 0.0001, "loss": 1.4483, "loss/crossentropy": 2.887086868286133, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.1788025200366974, "step": 4075 }, { "epoch": 0.06086352742666438, "grad_norm": 0.37109375, "grad_norm_var": 0.001833963394165039, "learning_rate": 0.0001, "loss": 1.6692, "loss/crossentropy": 2.577728271484375, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.23168914020061493, "step": 4076 }, { "epoch": 0.06087845959727936, "grad_norm": 0.515625, "grad_norm_var": 0.0025591532389322917, "learning_rate": 0.0001, "loss": 2.0917, "loss/crossentropy": 2.823411703109741, "loss/fcd": 1.7109375, "loss/idx": 9.0, "loss/logits": 0.3807218670845032, "step": 4077 }, { "epoch": 0.060893391767894336, "grad_norm": 0.373046875, "grad_norm_var": 0.002577654520670573, "learning_rate": 0.0001, "loss": 1.4697, "loss/crossentropy": 2.7214680910110474, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18067865073680878, "step": 4078 }, { "epoch": 0.06090832393850932, "grad_norm": 0.37890625, "grad_norm_var": 0.0026082356770833334, "learning_rate": 0.0001, "loss": 1.5467, "loss/crossentropy": 2.5675458908081055, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.19125165790319443, "step": 4079 }, { "epoch": 0.0609232561091243, "grad_norm": 0.3515625, "grad_norm_var": 0.0027798970540364584, "learning_rate": 0.0001, "loss": 1.6949, "loss/crossentropy": 2.5938953161239624, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.25744572281837463, "step": 4080 }, { "epoch": 0.060938188279739285, "grad_norm": 0.37109375, "grad_norm_var": 0.0027846654256184896, "learning_rate": 0.0001, "loss": 1.6799, "loss/crossentropy": 2.4431434869766235, "loss/fcd": 1.4453125, "loss/idx": 9.0, "loss/logits": 0.23462560772895813, "step": 4081 }, { "epoch": 0.06095312045035427, "grad_norm": 0.373046875, "grad_norm_var": 0.002821795145670573, "learning_rate": 0.0001, "loss": 1.4684, "loss/crossentropy": 2.819928288459778, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.1871878057718277, "step": 4082 }, { "epoch": 0.06096805262096925, "grad_norm": 0.421875, "grad_norm_var": 0.002848545710245768, "learning_rate": 0.0001, "loss": 1.819, "loss/crossentropy": 2.152781367301941, "loss/fcd": 1.5859375, "loss/idx": 9.0, "loss/logits": 0.23305538296699524, "step": 4083 }, { "epoch": 0.060982984791584226, "grad_norm": 0.4453125, "grad_norm_var": 0.0029126485188802082, "learning_rate": 0.0001, "loss": 1.5369, "loss/crossentropy": 2.72503399848938, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.18528994917869568, "step": 4084 }, { "epoch": 0.06099791696219921, "grad_norm": 0.37890625, "grad_norm_var": 0.0028813680013020835, "learning_rate": 0.0001, "loss": 1.6266, "loss/crossentropy": 2.803925633430481, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.22817204892635345, "step": 4085 }, { "epoch": 0.06101284913281419, "grad_norm": 0.388671875, "grad_norm_var": 0.002820571263631185, "learning_rate": 0.0001, "loss": 1.4779, "loss/crossentropy": 2.7997753620147705, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18102473765611649, "step": 4086 }, { "epoch": 0.061027781303429174, "grad_norm": 0.4296875, "grad_norm_var": 0.002842140197753906, "learning_rate": 0.0001, "loss": 1.6328, "loss/crossentropy": 2.3447338342666626, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.23438653349876404, "step": 4087 }, { "epoch": 0.06104271347404416, "grad_norm": 0.44140625, "grad_norm_var": 0.001967620849609375, "learning_rate": 0.0001, "loss": 1.6744, "loss/crossentropy": 2.636687994003296, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.252489410340786, "step": 4088 }, { "epoch": 0.06105764564465914, "grad_norm": 0.38671875, "grad_norm_var": 0.0019303480784098306, "learning_rate": 0.0001, "loss": 1.4938, "loss/crossentropy": 2.813384771347046, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18915162235498428, "step": 4089 }, { "epoch": 0.061072577815274115, "grad_norm": 0.34765625, "grad_norm_var": 0.002005449930826823, "learning_rate": 0.0001, "loss": 1.4201, "loss/crossentropy": 2.5980793237686157, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.17013975232839584, "step": 4090 }, { "epoch": 0.0610875099858891, "grad_norm": 0.349609375, "grad_norm_var": 0.002005449930826823, "learning_rate": 0.0001, "loss": 1.5152, "loss/crossentropy": 2.7412649393081665, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19102120399475098, "step": 4091 }, { "epoch": 0.06110244215650408, "grad_norm": 0.427734375, "grad_norm_var": 0.0020234266916910808, "learning_rate": 0.0001, "loss": 1.5579, "loss/crossentropy": 2.3805906772613525, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.1946481242775917, "step": 4092 }, { "epoch": 0.06111737432711906, "grad_norm": 0.369140625, "grad_norm_var": 0.0010828653971354167, "learning_rate": 0.0001, "loss": 1.5248, "loss/crossentropy": 2.510832905769348, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19280275702476501, "step": 4093 }, { "epoch": 0.061132306497734046, "grad_norm": 0.4296875, "grad_norm_var": 0.001157999038696289, "learning_rate": 0.0001, "loss": 1.6042, "loss/crossentropy": 2.617435574531555, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.22921040654182434, "step": 4094 }, { "epoch": 0.06114723866834902, "grad_norm": 0.439453125, "grad_norm_var": 0.001271820068359375, "learning_rate": 0.0001, "loss": 1.5982, "loss/crossentropy": 2.9441750049591064, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.19981181621551514, "step": 4095 }, { "epoch": 0.061162170838964004, "grad_norm": 0.37109375, "grad_norm_var": 0.0011774063110351562, "learning_rate": 0.0001, "loss": 1.442, "loss/crossentropy": 2.504625916481018, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.1764182597398758, "step": 4096 }, { "epoch": 0.06117710300957899, "grad_norm": 0.439453125, "grad_norm_var": 0.0012224674224853515, "learning_rate": 0.0001, "loss": 1.5729, "loss/crossentropy": 2.414254307746887, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.20960631221532822, "step": 4097 }, { "epoch": 0.06119203518019397, "grad_norm": 0.396484375, "grad_norm_var": 0.001164865493774414, "learning_rate": 0.0001, "loss": 1.4469, "loss/crossentropy": 2.5569030046463013, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.1734226495027542, "step": 4098 }, { "epoch": 0.06120696735080895, "grad_norm": 0.40234375, "grad_norm_var": 0.001141977310180664, "learning_rate": 0.0001, "loss": 1.6167, "loss/crossentropy": 2.8191369771957397, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.1986907348036766, "step": 4099 }, { "epoch": 0.061221899521423935, "grad_norm": 0.376953125, "grad_norm_var": 0.0010457356770833333, "learning_rate": 0.0001, "loss": 1.4578, "loss/crossentropy": 2.679119348526001, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.1882714405655861, "step": 4100 }, { "epoch": 0.06123683169203891, "grad_norm": 0.390625, "grad_norm_var": 0.0010238011678059896, "learning_rate": 0.0001, "loss": 1.5718, "loss/crossentropy": 2.5730100870132446, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.2046567127108574, "step": 4101 }, { "epoch": 0.061251763862653893, "grad_norm": 0.390625, "grad_norm_var": 0.0010213057200113933, "learning_rate": 0.0001, "loss": 1.5042, "loss/crossentropy": 2.6793601512908936, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.19948744773864746, "step": 4102 }, { "epoch": 0.061266696033268876, "grad_norm": 0.357421875, "grad_norm_var": 0.0010548273722330729, "learning_rate": 0.0001, "loss": 1.6029, "loss/crossentropy": 2.5808109045028687, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.2162030041217804, "step": 4103 }, { "epoch": 0.06128162820388386, "grad_norm": 0.3828125, "grad_norm_var": 0.0009051005045572917, "learning_rate": 0.0001, "loss": 1.4598, "loss/crossentropy": 2.5440412759780884, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18241487443447113, "step": 4104 }, { "epoch": 0.06129656037449884, "grad_norm": 0.326171875, "grad_norm_var": 0.0011696974436442057, "learning_rate": 0.0001, "loss": 1.4278, "loss/crossentropy": 2.5810130834579468, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.18169349431991577, "step": 4105 }, { "epoch": 0.06131149254511382, "grad_norm": 0.37109375, "grad_norm_var": 0.0010800520579020182, "learning_rate": 0.0001, "loss": 1.4833, "loss/crossentropy": 2.745807409286499, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.1864345222711563, "step": 4106 }, { "epoch": 0.0613264247157288, "grad_norm": 0.35546875, "grad_norm_var": 0.0010515848795572917, "learning_rate": 0.0001, "loss": 1.521, "loss/crossentropy": 2.4473992586135864, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19673824310302734, "step": 4107 }, { "epoch": 0.06134135688634378, "grad_norm": 0.33984375, "grad_norm_var": 0.0010823408762613933, "learning_rate": 0.0001, "loss": 1.3989, "loss/crossentropy": 2.4975948333740234, "loss/fcd": 1.23046875, "loss/idx": 9.0, "loss/logits": 0.1684490442276001, "step": 4108 }, { "epoch": 0.061356289056958765, "grad_norm": 0.361328125, "grad_norm_var": 0.0011012872060139974, "learning_rate": 0.0001, "loss": 1.4172, "loss/crossentropy": 2.6625843048095703, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.1710946336388588, "step": 4109 }, { "epoch": 0.06137122122757375, "grad_norm": 0.359375, "grad_norm_var": 0.0009742577870686849, "learning_rate": 0.0001, "loss": 1.4394, "loss/crossentropy": 2.7793816328048706, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17376024276018143, "step": 4110 }, { "epoch": 0.06138615339818873, "grad_norm": 0.32421875, "grad_norm_var": 0.000872039794921875, "learning_rate": 0.0001, "loss": 1.5293, "loss/crossentropy": 2.616391658782959, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.20509783923625946, "step": 4111 }, { "epoch": 0.06140108556880371, "grad_norm": 0.412109375, "grad_norm_var": 0.0009745121002197266, "learning_rate": 0.0001, "loss": 1.5352, "loss/crossentropy": 2.4979225397109985, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.18359794467687607, "step": 4112 }, { "epoch": 0.06141601773941869, "grad_norm": 0.38671875, "grad_norm_var": 0.0006891250610351563, "learning_rate": 0.0001, "loss": 1.5889, "loss/crossentropy": 2.449182629585266, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.1982349529862404, "step": 4113 }, { "epoch": 0.06143094991003367, "grad_norm": 0.3046875, "grad_norm_var": 0.0009020328521728516, "learning_rate": 0.0001, "loss": 1.4321, "loss/crossentropy": 2.551263928413391, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.1781948208808899, "step": 4114 }, { "epoch": 0.061445882080648655, "grad_norm": 0.375, "grad_norm_var": 0.0008130232493082682, "learning_rate": 0.0001, "loss": 1.4543, "loss/crossentropy": 2.6224470138549805, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.18871259689331055, "step": 4115 }, { "epoch": 0.06146081425126364, "grad_norm": 0.361328125, "grad_norm_var": 0.0008000532786051432, "learning_rate": 0.0001, "loss": 1.5488, "loss/crossentropy": 2.498465895652771, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.2011817768216133, "step": 4116 }, { "epoch": 0.06147574642187861, "grad_norm": 0.33203125, "grad_norm_var": 0.0007943312327067057, "learning_rate": 0.0001, "loss": 1.4864, "loss/crossentropy": 2.810651659965515, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18950074911117554, "step": 4117 }, { "epoch": 0.061490678592493596, "grad_norm": 0.31640625, "grad_norm_var": 0.0008233229319254558, "learning_rate": 0.0001, "loss": 1.4042, "loss/crossentropy": 2.665892481803894, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.16978049278259277, "step": 4118 }, { "epoch": 0.06150561076310858, "grad_norm": 0.3828125, "grad_norm_var": 0.0008747736612955729, "learning_rate": 0.0001, "loss": 1.5928, "loss/crossentropy": 2.6005897521972656, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.20221549272537231, "step": 4119 }, { "epoch": 0.06152054293372356, "grad_norm": 0.396484375, "grad_norm_var": 0.0009358565012613933, "learning_rate": 0.0001, "loss": 1.6485, "loss/crossentropy": 2.2661960124969482, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.2187732756137848, "step": 4120 }, { "epoch": 0.061535475104338544, "grad_norm": 0.419921875, "grad_norm_var": 0.0011052290598551433, "learning_rate": 0.0001, "loss": 1.629, "loss/crossentropy": 2.6182289123535156, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.2110668197274208, "step": 4121 }, { "epoch": 0.06155040727495353, "grad_norm": 0.439453125, "grad_norm_var": 0.001476287841796875, "learning_rate": 0.0001, "loss": 1.8091, "loss/crossentropy": 2.3036341667175293, "loss/fcd": 1.578125, "loss/idx": 9.0, "loss/logits": 0.23093612492084503, "step": 4122 }, { "epoch": 0.0615653394455685, "grad_norm": 0.369140625, "grad_norm_var": 0.0014674981435139974, "learning_rate": 0.0001, "loss": 1.4851, "loss/crossentropy": 2.5604562759399414, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18824435770511627, "step": 4123 }, { "epoch": 0.061580271616183485, "grad_norm": 0.375, "grad_norm_var": 0.0014148553212483723, "learning_rate": 0.0001, "loss": 1.575, "loss/crossentropy": 2.621224880218506, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.2155996412038803, "step": 4124 }, { "epoch": 0.06159520378679847, "grad_norm": 0.61328125, "grad_norm_var": 0.0050994237263997395, "learning_rate": 0.0001, "loss": 1.7376, "loss/crossentropy": 2.167931079864502, "loss/fcd": 1.5390625, "loss/idx": 9.0, "loss/logits": 0.19849632680416107, "step": 4125 }, { "epoch": 0.06161013595741345, "grad_norm": 0.40234375, "grad_norm_var": 0.005065155029296875, "learning_rate": 0.0001, "loss": 1.5295, "loss/crossentropy": 2.66765558719635, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.20138880610466003, "step": 4126 }, { "epoch": 0.06162506812802843, "grad_norm": 0.357421875, "grad_norm_var": 0.0048508803049723305, "learning_rate": 0.0001, "loss": 1.4317, "loss/crossentropy": 2.6065233945846558, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.16996175050735474, "step": 4127 }, { "epoch": 0.06164000029864341, "grad_norm": 0.43359375, "grad_norm_var": 0.00494232177734375, "learning_rate": 0.0001, "loss": 1.6685, "loss/crossentropy": 2.451615333557129, "loss/fcd": 1.4453125, "loss/idx": 9.0, "loss/logits": 0.22318579256534576, "step": 4128 }, { "epoch": 0.06165493246925839, "grad_norm": 0.337890625, "grad_norm_var": 0.005123122533162435, "learning_rate": 0.0001, "loss": 1.4301, "loss/crossentropy": 2.6467740535736084, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.17226414382457733, "step": 4129 }, { "epoch": 0.061669864639873374, "grad_norm": 0.3671875, "grad_norm_var": 0.004668410619099935, "learning_rate": 0.0001, "loss": 1.4504, "loss/crossentropy": 2.679363250732422, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17700833082199097, "step": 4130 }, { "epoch": 0.06168479681048836, "grad_norm": 0.47265625, "grad_norm_var": 0.005037164688110352, "learning_rate": 0.0001, "loss": 1.4045, "loss/crossentropy": 2.7100239992141724, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.16227269172668457, "step": 4131 }, { "epoch": 0.06169972898110334, "grad_norm": 0.34375, "grad_norm_var": 0.00514373779296875, "learning_rate": 0.0001, "loss": 1.5149, "loss/crossentropy": 2.4439231157302856, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.18673913925886154, "step": 4132 }, { "epoch": 0.06171466115171832, "grad_norm": 0.34765625, "grad_norm_var": 0.005022684733072917, "learning_rate": 0.0001, "loss": 1.4714, "loss/crossentropy": 2.706702470779419, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18236582726240158, "step": 4133 }, { "epoch": 0.0617295933223333, "grad_norm": 0.37109375, "grad_norm_var": 0.004611460367838541, "learning_rate": 0.0001, "loss": 1.5635, "loss/crossentropy": 2.79267156124115, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.21584157645702362, "step": 4134 }, { "epoch": 0.06174452549294828, "grad_norm": 0.3359375, "grad_norm_var": 0.0048678080240885414, "learning_rate": 0.0001, "loss": 1.5067, "loss/crossentropy": 2.6327672004699707, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.18243687599897385, "step": 4135 }, { "epoch": 0.061759457663563264, "grad_norm": 0.3359375, "grad_norm_var": 0.0051166375478108725, "learning_rate": 0.0001, "loss": 1.4463, "loss/crossentropy": 2.625819206237793, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.18067938089370728, "step": 4136 }, { "epoch": 0.061774389834178246, "grad_norm": 0.3828125, "grad_norm_var": 0.005080095926920573, "learning_rate": 0.0001, "loss": 1.4302, "loss/crossentropy": 2.615450143814087, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.1645539104938507, "step": 4137 }, { "epoch": 0.06178932200479323, "grad_norm": 0.392578125, "grad_norm_var": 0.004925982157389323, "learning_rate": 0.0001, "loss": 1.5898, "loss/crossentropy": 2.3705813884735107, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.20696303993463516, "step": 4138 }, { "epoch": 0.061804254175408205, "grad_norm": 0.388671875, "grad_norm_var": 0.004895782470703125, "learning_rate": 0.0001, "loss": 1.6409, "loss/crossentropy": 2.4882344007492065, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.22680891305208206, "step": 4139 }, { "epoch": 0.06181918634602319, "grad_norm": 0.3984375, "grad_norm_var": 0.0048797607421875, "learning_rate": 0.0001, "loss": 1.579, "loss/crossentropy": 2.3684297800064087, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.2000628560781479, "step": 4140 }, { "epoch": 0.06183411851663817, "grad_norm": 0.42578125, "grad_norm_var": 0.0015594482421875, "learning_rate": 0.0001, "loss": 1.6742, "loss/crossentropy": 2.825570583343506, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.2327881157398224, "step": 4141 }, { "epoch": 0.06184905068725315, "grad_norm": 0.40625, "grad_norm_var": 0.0015715916951497396, "learning_rate": 0.0001, "loss": 1.4319, "loss/crossentropy": 2.571076512336731, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.17018111050128937, "step": 4142 }, { "epoch": 0.061863982857868136, "grad_norm": 0.357421875, "grad_norm_var": 0.0015715916951497396, "learning_rate": 0.0001, "loss": 1.451, "loss/crossentropy": 2.4505945444107056, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.1775602400302887, "step": 4143 }, { "epoch": 0.06187891502848312, "grad_norm": 0.353515625, "grad_norm_var": 0.0014119307200113932, "learning_rate": 0.0001, "loss": 1.4132, "loss/crossentropy": 2.566299319267273, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.17099225521087646, "step": 4144 }, { "epoch": 0.061893847199098094, "grad_norm": 0.359375, "grad_norm_var": 0.001331329345703125, "learning_rate": 0.0001, "loss": 1.3782, "loss/crossentropy": 2.6248300075531006, "loss/fcd": 1.21875, "loss/idx": 9.0, "loss/logits": 0.15948551893234253, "step": 4145 }, { "epoch": 0.06190877936971308, "grad_norm": 0.41015625, "grad_norm_var": 0.0013879776000976563, "learning_rate": 0.0001, "loss": 1.5991, "loss/crossentropy": 2.5344157218933105, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.19679349660873413, "step": 4146 }, { "epoch": 0.06192371154032806, "grad_norm": 0.326171875, "grad_norm_var": 0.0009218692779541015, "learning_rate": 0.0001, "loss": 1.4775, "loss/crossentropy": 2.5304148197174072, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.20401708036661148, "step": 4147 }, { "epoch": 0.06193864371094304, "grad_norm": 0.365234375, "grad_norm_var": 0.0008727391560872396, "learning_rate": 0.0001, "loss": 1.4599, "loss/crossentropy": 2.6593194007873535, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18254941701889038, "step": 4148 }, { "epoch": 0.061953575881558025, "grad_norm": 0.46875, "grad_norm_var": 0.0013910929361979167, "learning_rate": 0.0001, "loss": 1.6264, "loss/crossentropy": 2.4898438453674316, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.2005896419286728, "step": 4149 }, { "epoch": 0.06196850805217301, "grad_norm": 0.44140625, "grad_norm_var": 0.0016176859537760417, "learning_rate": 0.0001, "loss": 1.5757, "loss/crossentropy": 2.8490949869155884, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20851297676563263, "step": 4150 }, { "epoch": 0.06198344022278798, "grad_norm": 0.412109375, "grad_norm_var": 0.0014893690745035808, "learning_rate": 0.0001, "loss": 1.5801, "loss/crossentropy": 2.494694471359253, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.19730807840824127, "step": 4151 }, { "epoch": 0.061998372393402966, "grad_norm": 0.396484375, "grad_norm_var": 0.001289812723795573, "learning_rate": 0.0001, "loss": 1.6186, "loss/crossentropy": 2.54204261302948, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.2045750916004181, "step": 4152 }, { "epoch": 0.06201330456401795, "grad_norm": 0.376953125, "grad_norm_var": 0.0012997786204020183, "learning_rate": 0.0001, "loss": 1.4812, "loss/crossentropy": 2.6609787940979004, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18818890303373337, "step": 4153 }, { "epoch": 0.06202823673463293, "grad_norm": 0.484375, "grad_norm_var": 0.0018279393513997396, "learning_rate": 0.0001, "loss": 1.6147, "loss/crossentropy": 2.8299832344055176, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.20456796884536743, "step": 4154 }, { "epoch": 0.062043168905247914, "grad_norm": 0.421875, "grad_norm_var": 0.0018546899159749349, "learning_rate": 0.0001, "loss": 1.8202, "loss/crossentropy": 3.079415202140808, "loss/fcd": 1.5390625, "loss/idx": 9.0, "loss/logits": 0.28117597103118896, "step": 4155 }, { "epoch": 0.06205810107586289, "grad_norm": 0.55859375, "grad_norm_var": 0.003418715794881185, "learning_rate": 0.0001, "loss": 1.6675, "loss/crossentropy": 2.6787418127059937, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.2065303847193718, "step": 4156 }, { "epoch": 0.06207303324647787, "grad_norm": 0.392578125, "grad_norm_var": 0.0034189860026041666, "learning_rate": 0.0001, "loss": 1.5676, "loss/crossentropy": 2.6063873767852783, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.19260701537132263, "step": 4157 }, { "epoch": 0.062087965417092855, "grad_norm": 0.578125, "grad_norm_var": 0.005220540364583333, "learning_rate": 0.0001, "loss": 1.7056, "loss/crossentropy": 2.3473074436187744, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.21730561554431915, "step": 4158 }, { "epoch": 0.06210289758770784, "grad_norm": 0.359375, "grad_norm_var": 0.00520475705464681, "learning_rate": 0.0001, "loss": 1.4872, "loss/crossentropy": 2.3895211219787598, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.16687827557325363, "step": 4159 }, { "epoch": 0.06211782975832282, "grad_norm": 0.443359375, "grad_norm_var": 0.00492399533589681, "learning_rate": 0.0001, "loss": 1.6269, "loss/crossentropy": 2.7047520875930786, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.22061675786972046, "step": 4160 }, { "epoch": 0.0621327619289378, "grad_norm": 0.412109375, "grad_norm_var": 0.00463860829671224, "learning_rate": 0.0001, "loss": 1.5779, "loss/crossentropy": 2.6993660926818848, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.21466702222824097, "step": 4161 }, { "epoch": 0.06214769409955278, "grad_norm": 0.41796875, "grad_norm_var": 0.0046238581339518225, "learning_rate": 0.0001, "loss": 1.6603, "loss/crossentropy": 2.5675097703933716, "loss/fcd": 1.4453125, "loss/idx": 9.0, "loss/logits": 0.2149762436747551, "step": 4162 }, { "epoch": 0.06216262627016776, "grad_norm": 0.333984375, "grad_norm_var": 0.00452111562093099, "learning_rate": 0.0001, "loss": 1.3928, "loss/crossentropy": 2.5062843561172485, "loss/fcd": 1.23046875, "loss/idx": 9.0, "loss/logits": 0.16233661770820618, "step": 4163 }, { "epoch": 0.062177558440782744, "grad_norm": 0.341796875, "grad_norm_var": 0.00475457509358724, "learning_rate": 0.0001, "loss": 1.4092, "loss/crossentropy": 2.538065791130066, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.16308461129665375, "step": 4164 }, { "epoch": 0.06219249061139773, "grad_norm": 0.431640625, "grad_norm_var": 0.004636494318644205, "learning_rate": 0.0001, "loss": 1.5161, "loss/crossentropy": 2.5315290689468384, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.17231037467718124, "step": 4165 }, { "epoch": 0.06220742278201271, "grad_norm": 0.39453125, "grad_norm_var": 0.004672352472941081, "learning_rate": 0.0001, "loss": 1.5109, "loss/crossentropy": 2.832209348678589, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.19453469663858414, "step": 4166 }, { "epoch": 0.062222354952627686, "grad_norm": 0.5625, "grad_norm_var": 0.005882771809895834, "learning_rate": 0.0001, "loss": 1.6289, "loss/crossentropy": 2.5380877256393433, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.2226310297846794, "step": 4167 }, { "epoch": 0.06223728712324267, "grad_norm": 0.390625, "grad_norm_var": 0.005912383397420247, "learning_rate": 0.0001, "loss": 1.5213, "loss/crossentropy": 2.5291827917099, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19315306097269058, "step": 4168 }, { "epoch": 0.06225221929385765, "grad_norm": 0.3671875, "grad_norm_var": 0.00598907470703125, "learning_rate": 0.0001, "loss": 1.5695, "loss/crossentropy": 2.498613119125366, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.20626850426197052, "step": 4169 }, { "epoch": 0.062267151464472634, "grad_norm": 0.361328125, "grad_norm_var": 0.0060541629791259766, "learning_rate": 0.0001, "loss": 1.4691, "loss/crossentropy": 2.6224172115325928, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.1800379902124405, "step": 4170 }, { "epoch": 0.062282083635087616, "grad_norm": 0.46484375, "grad_norm_var": 0.006163263320922851, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.658485174179077, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.16717422008514404, "step": 4171 }, { "epoch": 0.0622970158057026, "grad_norm": 0.396484375, "grad_norm_var": 0.004932403564453125, "learning_rate": 0.0001, "loss": 1.5219, "loss/crossentropy": 2.5674718618392944, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.18595310300588608, "step": 4172 }, { "epoch": 0.062311947976317575, "grad_norm": 0.439453125, "grad_norm_var": 0.004926300048828125, "learning_rate": 0.0001, "loss": 1.7118, "loss/crossentropy": 2.395804762840271, "loss/fcd": 1.51171875, "loss/idx": 9.0, "loss/logits": 0.20010050386190414, "step": 4173 }, { "epoch": 0.06232688014693256, "grad_norm": 0.380859375, "grad_norm_var": 0.0031588077545166016, "learning_rate": 0.0001, "loss": 1.5011, "loss/crossentropy": 2.618768811225891, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.19245965778827667, "step": 4174 }, { "epoch": 0.06234181231754754, "grad_norm": 0.359375, "grad_norm_var": 0.0031588077545166016, "learning_rate": 0.0001, "loss": 1.5398, "loss/crossentropy": 2.8295230865478516, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.19996654987335205, "step": 4175 }, { "epoch": 0.06235674448816252, "grad_norm": 0.392578125, "grad_norm_var": 0.0030678908030192056, "learning_rate": 0.0001, "loss": 1.474, "loss/crossentropy": 2.554890275001526, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.17708275467157364, "step": 4176 }, { "epoch": 0.062371676658777506, "grad_norm": 0.373046875, "grad_norm_var": 0.003115574518839518, "learning_rate": 0.0001, "loss": 1.5461, "loss/crossentropy": 2.4854588508605957, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.19843821227550507, "step": 4177 }, { "epoch": 0.06238660882939248, "grad_norm": 0.63671875, "grad_norm_var": 0.006615432103474935, "learning_rate": 0.0001, "loss": 1.8812, "loss/crossentropy": 2.6901737451553345, "loss/fcd": 1.62890625, "loss/idx": 9.0, "loss/logits": 0.25225780159235, "step": 4178 }, { "epoch": 0.062401541000007464, "grad_norm": 0.431640625, "grad_norm_var": 0.006167205174763998, "learning_rate": 0.0001, "loss": 1.5938, "loss/crossentropy": 2.6709905862808228, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.20320633053779602, "step": 4179 }, { "epoch": 0.06241647317062245, "grad_norm": 0.412109375, "grad_norm_var": 0.00574034055074056, "learning_rate": 0.0001, "loss": 1.489, "loss/crossentropy": 2.7460575103759766, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.19603929668664932, "step": 4180 }, { "epoch": 0.06243140534123743, "grad_norm": 0.443359375, "grad_norm_var": 0.005759795506795247, "learning_rate": 0.0001, "loss": 1.4987, "loss/crossentropy": 2.6076802015304565, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.17055176943540573, "step": 4181 }, { "epoch": 0.06244633751185241, "grad_norm": 0.3515625, "grad_norm_var": 0.006052128473917643, "learning_rate": 0.0001, "loss": 1.4529, "loss/crossentropy": 2.4675780534744263, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.16384821385145187, "step": 4182 }, { "epoch": 0.062461269682467395, "grad_norm": 0.33984375, "grad_norm_var": 0.005001179377237956, "learning_rate": 0.0001, "loss": 1.7007, "loss/crossentropy": 2.440733551979065, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.2593073546886444, "step": 4183 }, { "epoch": 0.06247620185308237, "grad_norm": 0.40234375, "grad_norm_var": 0.004981342951456706, "learning_rate": 0.0001, "loss": 1.5661, "loss/crossentropy": 2.844694137573242, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.21452739089727402, "step": 4184 }, { "epoch": 0.06249113402369735, "grad_norm": 0.3671875, "grad_norm_var": 0.004981342951456706, "learning_rate": 0.0001, "loss": 1.4931, "loss/crossentropy": 2.7244322299957275, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.18454435467720032, "step": 4185 }, { "epoch": 0.06250606619431234, "grad_norm": 0.39453125, "grad_norm_var": 0.0048367818196614586, "learning_rate": 0.0001, "loss": 1.6195, "loss/crossentropy": 2.5717486143112183, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.22499637305736542, "step": 4186 }, { "epoch": 0.06252099836492732, "grad_norm": 0.38671875, "grad_norm_var": 0.004663848876953125, "learning_rate": 0.0001, "loss": 1.5321, "loss/crossentropy": 2.617864727973938, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.19621144235134125, "step": 4187 }, { "epoch": 0.0625359305355423, "grad_norm": 0.357421875, "grad_norm_var": 0.0048126220703125, "learning_rate": 0.0001, "loss": 1.4141, "loss/crossentropy": 2.6894606351852417, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.16020818054676056, "step": 4188 }, { "epoch": 0.06255086270615728, "grad_norm": 0.53515625, "grad_norm_var": 0.005833673477172852, "learning_rate": 0.0001, "loss": 1.9954, "loss/crossentropy": 2.7962028980255127, "loss/fcd": 1.62890625, "loss/idx": 9.0, "loss/logits": 0.3665277808904648, "step": 4189 }, { "epoch": 0.06256579487677226, "grad_norm": 0.373046875, "grad_norm_var": 0.0058681329091389975, "learning_rate": 0.0001, "loss": 1.5987, "loss/crossentropy": 2.6936161518096924, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.21194668114185333, "step": 4190 }, { "epoch": 0.06258072704738725, "grad_norm": 0.328125, "grad_norm_var": 0.006139230728149414, "learning_rate": 0.0001, "loss": 1.395, "loss/crossentropy": 2.666784644126892, "loss/fcd": 1.2265625, "loss/idx": 9.0, "loss/logits": 0.16846365481615067, "step": 4191 }, { "epoch": 0.06259565921800223, "grad_norm": 0.4921875, "grad_norm_var": 0.00655670166015625, "learning_rate": 0.0001, "loss": 1.5867, "loss/crossentropy": 2.8010789155960083, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.19997135549783707, "step": 4192 }, { "epoch": 0.0626105913886172, "grad_norm": 0.353515625, "grad_norm_var": 0.006687355041503906, "learning_rate": 0.0001, "loss": 1.4673, "loss/crossentropy": 2.647846221923828, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.1704244166612625, "step": 4193 }, { "epoch": 0.06262552355923219, "grad_norm": 0.73046875, "grad_norm_var": 0.010035133361816407, "learning_rate": 0.0001, "loss": 1.6927, "loss/crossentropy": 2.61002516746521, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.2317432016134262, "step": 4194 }, { "epoch": 0.06264045572984717, "grad_norm": 0.375, "grad_norm_var": 0.01013792355855306, "learning_rate": 0.0001, "loss": 1.525, "loss/crossentropy": 2.6543556451797485, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.2008216232061386, "step": 4195 }, { "epoch": 0.06265538790046216, "grad_norm": 0.353515625, "grad_norm_var": 0.010376342137654622, "learning_rate": 0.0001, "loss": 1.5203, "loss/crossentropy": 2.4380463361740112, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18826710432767868, "step": 4196 }, { "epoch": 0.06267032007107713, "grad_norm": 0.3828125, "grad_norm_var": 0.010348256429036458, "learning_rate": 0.0001, "loss": 1.5563, "loss/crossentropy": 2.4141619205474854, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.17352447658777237, "step": 4197 }, { "epoch": 0.06268525224169211, "grad_norm": 0.4609375, "grad_norm_var": 0.010277048746744791, "learning_rate": 0.0001, "loss": 1.4219, "loss/crossentropy": 2.5594491958618164, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.17189256846904755, "step": 4198 }, { "epoch": 0.0627001844123071, "grad_norm": 0.41796875, "grad_norm_var": 0.009880320231119791, "learning_rate": 0.0001, "loss": 1.6506, "loss/crossentropy": 2.6020723581314087, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.23259460180997849, "step": 4199 }, { "epoch": 0.06271511658292207, "grad_norm": 0.412109375, "grad_norm_var": 0.009864028294881184, "learning_rate": 0.0001, "loss": 1.4301, "loss/crossentropy": 2.54639995098114, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.17231156677007675, "step": 4200 }, { "epoch": 0.06273004875353706, "grad_norm": 0.361328125, "grad_norm_var": 0.00990746815999349, "learning_rate": 0.0001, "loss": 1.4868, "loss/crossentropy": 2.614685535430908, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.17819322645664215, "step": 4201 }, { "epoch": 0.06274498092415204, "grad_norm": 0.40234375, "grad_norm_var": 0.009885088602701823, "learning_rate": 0.0001, "loss": 1.4617, "loss/crossentropy": 2.532719135284424, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.17658502608537674, "step": 4202 }, { "epoch": 0.06275991309476701, "grad_norm": 0.39453125, "grad_norm_var": 0.00985406239827474, "learning_rate": 0.0001, "loss": 1.4843, "loss/crossentropy": 2.21194189786911, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.17566804587841034, "step": 4203 }, { "epoch": 0.062774845265382, "grad_norm": 0.353515625, "grad_norm_var": 0.009887949625651041, "learning_rate": 0.0001, "loss": 1.4681, "loss/crossentropy": 2.4155216217041016, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.1712505966424942, "step": 4204 }, { "epoch": 0.06278977743599698, "grad_norm": 0.380859375, "grad_norm_var": 0.009015258153279622, "learning_rate": 0.0001, "loss": 1.4296, "loss/crossentropy": 2.6804338693618774, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.16788608580827713, "step": 4205 }, { "epoch": 0.06280470960661197, "grad_norm": 0.388671875, "grad_norm_var": 0.008951934178670247, "learning_rate": 0.0001, "loss": 1.4506, "loss/crossentropy": 2.632022261619568, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17719624191522598, "step": 4206 }, { "epoch": 0.06281964177722694, "grad_norm": 0.40625, "grad_norm_var": 0.00846238136291504, "learning_rate": 0.0001, "loss": 1.5475, "loss/crossentropy": 2.617356300354004, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.1920207291841507, "step": 4207 }, { "epoch": 0.06283457394784193, "grad_norm": 0.37890625, "grad_norm_var": 0.008123127619425456, "learning_rate": 0.0001, "loss": 1.3542, "loss/crossentropy": 2.516735315322876, "loss/fcd": 1.20703125, "loss/idx": 9.0, "loss/logits": 0.1471373587846756, "step": 4208 }, { "epoch": 0.06284950611845691, "grad_norm": 0.40234375, "grad_norm_var": 0.007907358805338542, "learning_rate": 0.0001, "loss": 1.5313, "loss/crossentropy": 2.601340413093567, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.18757174164056778, "step": 4209 }, { "epoch": 0.06286443828907189, "grad_norm": 0.453125, "grad_norm_var": 0.0009602228800455729, "learning_rate": 0.0001, "loss": 1.5923, "loss/crossentropy": 2.460425853729248, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.19384414702653885, "step": 4210 }, { "epoch": 0.06287937045968688, "grad_norm": 0.365234375, "grad_norm_var": 0.0009925683339436849, "learning_rate": 0.0001, "loss": 1.3813, "loss/crossentropy": 2.631314992904663, "loss/fcd": 1.21875, "loss/idx": 9.0, "loss/logits": 0.1625884547829628, "step": 4211 }, { "epoch": 0.06289430263030185, "grad_norm": 0.388671875, "grad_norm_var": 0.0008769830067952474, "learning_rate": 0.0001, "loss": 1.642, "loss/crossentropy": 2.6938732862472534, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.2318340316414833, "step": 4212 }, { "epoch": 0.06290923480091684, "grad_norm": 0.328125, "grad_norm_var": 0.0011662642161051433, "learning_rate": 0.0001, "loss": 1.5741, "loss/crossentropy": 2.49255907535553, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.19909676164388657, "step": 4213 }, { "epoch": 0.06292416697153182, "grad_norm": 0.474609375, "grad_norm_var": 0.0013010025024414063, "learning_rate": 0.0001, "loss": 1.6087, "loss/crossentropy": 2.719289183616638, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.23760594427585602, "step": 4214 }, { "epoch": 0.06293909914214679, "grad_norm": 0.384765625, "grad_norm_var": 0.0012650648752848307, "learning_rate": 0.0001, "loss": 1.5924, "loss/crossentropy": 2.639672875404358, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.2135343849658966, "step": 4215 }, { "epoch": 0.06295403131276178, "grad_norm": 0.361328125, "grad_norm_var": 0.001291513442993164, "learning_rate": 0.0001, "loss": 1.4732, "loss/crossentropy": 2.4380663633346558, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18799610435962677, "step": 4216 }, { "epoch": 0.06296896348337676, "grad_norm": 0.3671875, "grad_norm_var": 0.0012720108032226562, "learning_rate": 0.0001, "loss": 1.503, "loss/crossentropy": 2.436497688293457, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.19439401477575302, "step": 4217 }, { "epoch": 0.06298389565399175, "grad_norm": 0.337890625, "grad_norm_var": 0.001420450210571289, "learning_rate": 0.0001, "loss": 1.5396, "loss/crossentropy": 2.538195490837097, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.20368294417858124, "step": 4218 }, { "epoch": 0.06299882782460672, "grad_norm": 0.369140625, "grad_norm_var": 0.00142974853515625, "learning_rate": 0.0001, "loss": 1.5073, "loss/crossentropy": 2.511701226234436, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.17916923761367798, "step": 4219 }, { "epoch": 0.0630137599952217, "grad_norm": 0.37109375, "grad_norm_var": 0.0013781070709228515, "learning_rate": 0.0001, "loss": 1.4687, "loss/crossentropy": 2.573077082633972, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.1835201159119606, "step": 4220 }, { "epoch": 0.06302869216583669, "grad_norm": 0.42578125, "grad_norm_var": 0.0014801025390625, "learning_rate": 0.0001, "loss": 1.6485, "loss/crossentropy": 2.25791597366333, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.2266004979610443, "step": 4221 }, { "epoch": 0.06304362433645166, "grad_norm": 0.349609375, "grad_norm_var": 0.0015703837076822917, "learning_rate": 0.0001, "loss": 1.5316, "loss/crossentropy": 2.5472071170806885, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.2034754753112793, "step": 4222 }, { "epoch": 0.06305855650706665, "grad_norm": 0.34375, "grad_norm_var": 0.001639556884765625, "learning_rate": 0.0001, "loss": 1.5158, "loss/crossentropy": 2.548830032348633, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.18768948316574097, "step": 4223 }, { "epoch": 0.06307348867768163, "grad_norm": 0.36328125, "grad_norm_var": 0.0016599019368489583, "learning_rate": 0.0001, "loss": 1.487, "loss/crossentropy": 2.431180477142334, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.17453670501708984, "step": 4224 }, { "epoch": 0.0630884208482966, "grad_norm": 0.42578125, "grad_norm_var": 0.0017628987630208333, "learning_rate": 0.0001, "loss": 1.7192, "loss/crossentropy": 2.5905650854110718, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.254387803375721, "step": 4225 }, { "epoch": 0.0631033530189116, "grad_norm": 0.51171875, "grad_norm_var": 0.0025344212849934896, "learning_rate": 0.0001, "loss": 1.6206, "loss/crossentropy": 2.5199122428894043, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.19869928061962128, "step": 4226 }, { "epoch": 0.06311828518952657, "grad_norm": 0.40625, "grad_norm_var": 0.0025287469228108725, "learning_rate": 0.0001, "loss": 1.7173, "loss/crossentropy": 2.479785680770874, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.23686932027339935, "step": 4227 }, { "epoch": 0.06313321736014156, "grad_norm": 0.37109375, "grad_norm_var": 0.0025466283162434897, "learning_rate": 0.0001, "loss": 1.487, "loss/crossentropy": 2.6450949907302856, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.19009708613157272, "step": 4228 }, { "epoch": 0.06314814953075654, "grad_norm": 0.412109375, "grad_norm_var": 0.0023286024729410808, "learning_rate": 0.0001, "loss": 1.6831, "loss/crossentropy": 2.731441855430603, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.22993410378694534, "step": 4229 }, { "epoch": 0.06316308170137153, "grad_norm": 0.38671875, "grad_norm_var": 0.0018458048502604167, "learning_rate": 0.0001, "loss": 1.5318, "loss/crossentropy": 2.859150767326355, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19974661618471146, "step": 4230 }, { "epoch": 0.0631780138719865, "grad_norm": 0.337890625, "grad_norm_var": 0.001995340983072917, "learning_rate": 0.0001, "loss": 1.4274, "loss/crossentropy": 2.55178165435791, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.17349987477064133, "step": 4231 }, { "epoch": 0.06319294604260148, "grad_norm": 0.34375, "grad_norm_var": 0.0020672957102457683, "learning_rate": 0.0001, "loss": 1.4228, "loss/crossentropy": 2.6316564083099365, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.16888638585805893, "step": 4232 }, { "epoch": 0.06320787821321647, "grad_norm": 0.349609375, "grad_norm_var": 0.002122942606608073, "learning_rate": 0.0001, "loss": 1.405, "loss/crossentropy": 2.603370428085327, "loss/fcd": 1.23828125, "loss/idx": 9.0, "loss/logits": 0.16668933629989624, "step": 4233 }, { "epoch": 0.06322281038383144, "grad_norm": 0.36328125, "grad_norm_var": 0.0020152886708577475, "learning_rate": 0.0001, "loss": 1.5333, "loss/crossentropy": 2.631303310394287, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.19342782348394394, "step": 4234 }, { "epoch": 0.06323774255444643, "grad_norm": 0.451171875, "grad_norm_var": 0.0022823174794514975, "learning_rate": 0.0001, "loss": 1.6549, "loss/crossentropy": 2.4850032329559326, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.2135113924741745, "step": 4235 }, { "epoch": 0.06325267472506141, "grad_norm": 0.734375, "grad_norm_var": 0.00969694455464681, "learning_rate": 0.0001, "loss": 2.0902, "loss/crossentropy": 2.289637327194214, "loss/fcd": 1.74609375, "loss/idx": 9.0, "loss/logits": 0.3441494405269623, "step": 4236 }, { "epoch": 0.06326760689567638, "grad_norm": 0.37890625, "grad_norm_var": 0.009741957982381184, "learning_rate": 0.0001, "loss": 1.4072, "loss/crossentropy": 2.3935922384262085, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.1610722616314888, "step": 4237 }, { "epoch": 0.06328253906629137, "grad_norm": 0.3671875, "grad_norm_var": 0.009624226888020834, "learning_rate": 0.0001, "loss": 1.4842, "loss/crossentropy": 2.84609317779541, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.1873488873243332, "step": 4238 }, { "epoch": 0.06329747123690635, "grad_norm": 0.375, "grad_norm_var": 0.009412638346354167, "learning_rate": 0.0001, "loss": 1.358, "loss/crossentropy": 2.6419055461883545, "loss/fcd": 1.2109375, "loss/idx": 9.0, "loss/logits": 0.14703209698200226, "step": 4239 }, { "epoch": 0.06331240340752134, "grad_norm": 0.39453125, "grad_norm_var": 0.0092742919921875, "learning_rate": 0.0001, "loss": 1.5575, "loss/crossentropy": 2.1855770349502563, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.18636227399110794, "step": 4240 }, { "epoch": 0.06332733557813632, "grad_norm": 0.3515625, "grad_norm_var": 0.009492937723795574, "learning_rate": 0.0001, "loss": 1.5822, "loss/crossentropy": 2.671903610229492, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.20723819732666016, "step": 4241 }, { "epoch": 0.06334226774875129, "grad_norm": 0.384765625, "grad_norm_var": 0.008752171198527019, "learning_rate": 0.0001, "loss": 1.5769, "loss/crossentropy": 2.6896839141845703, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.20189445465803146, "step": 4242 }, { "epoch": 0.06335719991936628, "grad_norm": 0.419921875, "grad_norm_var": 0.008774312337239583, "learning_rate": 0.0001, "loss": 1.5198, "loss/crossentropy": 2.6977410316467285, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19165275990962982, "step": 4243 }, { "epoch": 0.06337213208998126, "grad_norm": 0.38671875, "grad_norm_var": 0.00872650146484375, "learning_rate": 0.0001, "loss": 1.6064, "loss/crossentropy": 2.5201679468154907, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.20010517537593842, "step": 4244 }, { "epoch": 0.06338706426059625, "grad_norm": 0.326171875, "grad_norm_var": 0.009076182047526042, "learning_rate": 0.0001, "loss": 1.5629, "loss/crossentropy": 2.5325485467910767, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20355278998613358, "step": 4245 }, { "epoch": 0.06340199643121122, "grad_norm": 0.341796875, "grad_norm_var": 0.009263722101847331, "learning_rate": 0.0001, "loss": 1.3808, "loss/crossentropy": 2.6329153776168823, "loss/fcd": 1.2109375, "loss/idx": 9.0, "loss/logits": 0.16985872387886047, "step": 4246 }, { "epoch": 0.06341692860182621, "grad_norm": 0.4921875, "grad_norm_var": 0.009593963623046875, "learning_rate": 0.0001, "loss": 1.671, "loss/crossentropy": 2.316709876060486, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.2373836785554886, "step": 4247 }, { "epoch": 0.06343186077244119, "grad_norm": 0.427734375, "grad_norm_var": 0.009362268447875976, "learning_rate": 0.0001, "loss": 1.725, "loss/crossentropy": 2.634486675262451, "loss/fcd": 1.4765625, "loss/idx": 9.0, "loss/logits": 0.24840524047613144, "step": 4248 }, { "epoch": 0.06344679294305616, "grad_norm": 0.318359375, "grad_norm_var": 0.00967100461324056, "learning_rate": 0.0001, "loss": 1.486, "loss/crossentropy": 2.628062605857849, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18910907208919525, "step": 4249 }, { "epoch": 0.06346172511367115, "grad_norm": 0.392578125, "grad_norm_var": 0.009553464253743489, "learning_rate": 0.0001, "loss": 1.4407, "loss/crossentropy": 2.778687834739685, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.1829102486371994, "step": 4250 }, { "epoch": 0.06347665728428613, "grad_norm": 1.1953125, "grad_norm_var": 0.048353179295857744, "learning_rate": 0.0001, "loss": 1.8291, "loss/crossentropy": 2.377539038658142, "loss/fcd": 1.5546875, "loss/idx": 9.0, "loss/logits": 0.274421826004982, "step": 4251 }, { "epoch": 0.06349158945490112, "grad_norm": 0.34375, "grad_norm_var": 0.04336228370666504, "learning_rate": 0.0001, "loss": 1.3976, "loss/crossentropy": 2.4790674448013306, "loss/fcd": 1.23046875, "loss/idx": 9.0, "loss/logits": 0.16717875003814697, "step": 4252 }, { "epoch": 0.0635065216255161, "grad_norm": 0.416015625, "grad_norm_var": 0.04319044748942057, "learning_rate": 0.0001, "loss": 1.7508, "loss/crossentropy": 2.387219190597534, "loss/fcd": 1.50390625, "loss/idx": 9.0, "loss/logits": 0.24693743139505386, "step": 4253 }, { "epoch": 0.06352145379613107, "grad_norm": 0.365234375, "grad_norm_var": 0.04320791562398275, "learning_rate": 0.0001, "loss": 1.4933, "loss/crossentropy": 2.5024991035461426, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.20419470220804214, "step": 4254 }, { "epoch": 0.06353638596674606, "grad_norm": 0.46484375, "grad_norm_var": 0.04301489194234212, "learning_rate": 0.0001, "loss": 1.6675, "loss/crossentropy": 2.4802403450012207, "loss/fcd": 1.4453125, "loss/idx": 9.0, "loss/logits": 0.22219746559858322, "step": 4255 }, { "epoch": 0.06355131813736103, "grad_norm": 0.412109375, "grad_norm_var": 0.042930348714192705, "learning_rate": 0.0001, "loss": 1.6485, "loss/crossentropy": 2.6581904888153076, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.2188379019498825, "step": 4256 }, { "epoch": 0.06356625030797602, "grad_norm": 0.349609375, "grad_norm_var": 0.04295360247294108, "learning_rate": 0.0001, "loss": 1.74, "loss/crossentropy": 2.3880937099456787, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.2595532089471817, "step": 4257 }, { "epoch": 0.063581182478591, "grad_norm": 0.361328125, "grad_norm_var": 0.043159977595011396, "learning_rate": 0.0001, "loss": 1.4048, "loss/crossentropy": 2.4806575775146484, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.16257119551301003, "step": 4258 }, { "epoch": 0.06359611464920598, "grad_norm": 0.4375, "grad_norm_var": 0.04313608805338542, "learning_rate": 0.0001, "loss": 1.5976, "loss/crossentropy": 2.631749153137207, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.2109185755252838, "step": 4259 }, { "epoch": 0.06361104681982097, "grad_norm": 0.453125, "grad_norm_var": 0.042944780985514325, "learning_rate": 0.0001, "loss": 1.5274, "loss/crossentropy": 2.6769416332244873, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.18368291854858398, "step": 4260 }, { "epoch": 0.06362597899043594, "grad_norm": 0.35546875, "grad_norm_var": 0.04253970781962077, "learning_rate": 0.0001, "loss": 1.5136, "loss/crossentropy": 2.6963335275650024, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.19330129772424698, "step": 4261 }, { "epoch": 0.06364091116105093, "grad_norm": 0.470703125, "grad_norm_var": 0.04179698626200358, "learning_rate": 0.0001, "loss": 1.7562, "loss/crossentropy": 2.8776679039001465, "loss/fcd": 1.53515625, "loss/idx": 9.0, "loss/logits": 0.22108887135982513, "step": 4262 }, { "epoch": 0.0636558433316659, "grad_norm": 0.373046875, "grad_norm_var": 0.042069435119628906, "learning_rate": 0.0001, "loss": 1.6552, "loss/crossentropy": 2.5102500915527344, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.2254875749349594, "step": 4263 }, { "epoch": 0.06367077550228088, "grad_norm": 0.46484375, "grad_norm_var": 0.04206490516662598, "learning_rate": 0.0001, "loss": 1.726, "loss/crossentropy": 2.7836469411849976, "loss/fcd": 1.49609375, "loss/idx": 9.0, "loss/logits": 0.22989127784967422, "step": 4264 }, { "epoch": 0.06368570767289587, "grad_norm": 0.37890625, "grad_norm_var": 0.0412445068359375, "learning_rate": 0.0001, "loss": 1.6156, "loss/crossentropy": 2.5288264751434326, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.19760461151599884, "step": 4265 }, { "epoch": 0.06370063984351085, "grad_norm": 0.3984375, "grad_norm_var": 0.041200113296508786, "learning_rate": 0.0001, "loss": 1.7635, "loss/crossentropy": 2.51729953289032, "loss/fcd": 1.515625, "loss/idx": 9.0, "loss/logits": 0.24784618616104126, "step": 4266 }, { "epoch": 0.06371557201412584, "grad_norm": 0.36328125, "grad_norm_var": 0.0020632266998291014, "learning_rate": 0.0001, "loss": 1.5165, "loss/crossentropy": 2.7544749975204468, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.2000599429011345, "step": 4267 }, { "epoch": 0.06373050418474081, "grad_norm": 0.474609375, "grad_norm_var": 0.002143096923828125, "learning_rate": 0.0001, "loss": 1.7054, "loss/crossentropy": 2.4427123069763184, "loss/fcd": 1.4921875, "loss/idx": 9.0, "loss/logits": 0.21323615312576294, "step": 4268 }, { "epoch": 0.0637454363553558, "grad_norm": 0.388671875, "grad_norm_var": 0.002163124084472656, "learning_rate": 0.0001, "loss": 1.5497, "loss/crossentropy": 2.5214831829071045, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.20208685845136642, "step": 4269 }, { "epoch": 0.06376036852597078, "grad_norm": 0.349609375, "grad_norm_var": 0.0022653579711914063, "learning_rate": 0.0001, "loss": 1.5653, "loss/crossentropy": 2.6016972064971924, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.19807928800582886, "step": 4270 }, { "epoch": 0.06377530069658575, "grad_norm": 0.353515625, "grad_norm_var": 0.002166604995727539, "learning_rate": 0.0001, "loss": 1.4909, "loss/crossentropy": 2.5714287757873535, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.19015280902385712, "step": 4271 }, { "epoch": 0.06379023286720074, "grad_norm": 0.3515625, "grad_norm_var": 0.0022902806599934896, "learning_rate": 0.0001, "loss": 1.4473, "loss/crossentropy": 2.489862322807312, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17383597791194916, "step": 4272 }, { "epoch": 0.06380516503781572, "grad_norm": 0.3671875, "grad_norm_var": 0.002202590306599935, "learning_rate": 0.0001, "loss": 1.5549, "loss/crossentropy": 2.7072607278823853, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.1955701932311058, "step": 4273 }, { "epoch": 0.06382009720843071, "grad_norm": 0.326171875, "grad_norm_var": 0.0024440606435139974, "learning_rate": 0.0001, "loss": 1.4213, "loss/crossentropy": 2.455703616142273, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.1713200807571411, "step": 4274 }, { "epoch": 0.06383502937904569, "grad_norm": 0.4140625, "grad_norm_var": 0.002342971165974935, "learning_rate": 0.0001, "loss": 1.6445, "loss/crossentropy": 2.4691309928894043, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.20698190480470657, "step": 4275 }, { "epoch": 0.06384996154966066, "grad_norm": 0.5078125, "grad_norm_var": 0.0029704888661702474, "learning_rate": 0.0001, "loss": 1.6069, "loss/crossentropy": 2.4944599866867065, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.1889009103178978, "step": 4276 }, { "epoch": 0.06386489372027565, "grad_norm": 0.37890625, "grad_norm_var": 0.00287779172261556, "learning_rate": 0.0001, "loss": 1.4287, "loss/crossentropy": 2.318110942840576, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.15523862838745117, "step": 4277 }, { "epoch": 0.06387982589089063, "grad_norm": 0.4140625, "grad_norm_var": 0.002526092529296875, "learning_rate": 0.0001, "loss": 1.604, "loss/crossentropy": 2.5452606678009033, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.23286797106266022, "step": 4278 }, { "epoch": 0.06389475806150562, "grad_norm": 0.353515625, "grad_norm_var": 0.0026046117146809895, "learning_rate": 0.0001, "loss": 1.5161, "loss/crossentropy": 2.6713099479675293, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1997172236442566, "step": 4279 }, { "epoch": 0.06390969023212059, "grad_norm": 0.58203125, "grad_norm_var": 0.0045882542928059895, "learning_rate": 0.0001, "loss": 1.6881, "loss/crossentropy": 2.9002346992492676, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.2544946223497391, "step": 4280 }, { "epoch": 0.06392462240273557, "grad_norm": 0.44921875, "grad_norm_var": 0.004698117574055989, "learning_rate": 0.0001, "loss": 1.6482, "loss/crossentropy": 2.3703967332839966, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.21456165611743927, "step": 4281 }, { "epoch": 0.06393955457335056, "grad_norm": 0.408203125, "grad_norm_var": 0.0046961307525634766, "learning_rate": 0.0001, "loss": 1.5201, "loss/crossentropy": 2.6363041400909424, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.1919921264052391, "step": 4282 }, { "epoch": 0.06395448674396553, "grad_norm": 0.361328125, "grad_norm_var": 0.00470727284749349, "learning_rate": 0.0001, "loss": 1.5171, "loss/crossentropy": 2.5720503330230713, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.20852426439523697, "step": 4283 }, { "epoch": 0.06396941891458052, "grad_norm": 0.458984375, "grad_norm_var": 0.00457757314046224, "learning_rate": 0.0001, "loss": 1.6092, "loss/crossentropy": 2.7299610376358032, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.19908416271209717, "step": 4284 }, { "epoch": 0.0639843510851955, "grad_norm": 0.578125, "grad_norm_var": 0.006432326634724935, "learning_rate": 0.0001, "loss": 1.7352, "loss/crossentropy": 2.3763370513916016, "loss/fcd": 1.52734375, "loss/idx": 9.0, "loss/logits": 0.20788905769586563, "step": 4285 }, { "epoch": 0.06399928325581047, "grad_norm": 0.447265625, "grad_norm_var": 0.006165297826131185, "learning_rate": 0.0001, "loss": 1.4744, "loss/crossentropy": 2.659751772880554, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18138743191957474, "step": 4286 }, { "epoch": 0.06401421542642546, "grad_norm": 0.41796875, "grad_norm_var": 0.00583642323811849, "learning_rate": 0.0001, "loss": 1.7105, "loss/crossentropy": 2.495036482810974, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.22999779880046844, "step": 4287 }, { "epoch": 0.06402914759704044, "grad_norm": 0.33984375, "grad_norm_var": 0.005961354573567708, "learning_rate": 0.0001, "loss": 1.4568, "loss/crossentropy": 2.663183569908142, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.17942088842391968, "step": 4288 }, { "epoch": 0.06404407976765543, "grad_norm": 0.416015625, "grad_norm_var": 0.0057320753733317055, "learning_rate": 0.0001, "loss": 1.5571, "loss/crossentropy": 2.774498462677002, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19773419201374054, "step": 4289 }, { "epoch": 0.0640590119382704, "grad_norm": 0.36328125, "grad_norm_var": 0.005312601725260417, "learning_rate": 0.0001, "loss": 1.5369, "loss/crossentropy": 2.464685559272766, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.2048199474811554, "step": 4290 }, { "epoch": 0.0640739441088854, "grad_norm": 0.431640625, "grad_norm_var": 0.005293003718058268, "learning_rate": 0.0001, "loss": 1.7114, "loss/crossentropy": 2.679101586341858, "loss/fcd": 1.45703125, "loss/idx": 9.0, "loss/logits": 0.2543264701962471, "step": 4291 }, { "epoch": 0.06408887627950037, "grad_norm": 0.408203125, "grad_norm_var": 0.004903093973795573, "learning_rate": 0.0001, "loss": 1.6145, "loss/crossentropy": 2.554627537727356, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.22389238327741623, "step": 4292 }, { "epoch": 0.06410380845011535, "grad_norm": 0.39453125, "grad_norm_var": 0.004821205139160156, "learning_rate": 0.0001, "loss": 1.4315, "loss/crossentropy": 2.496920347213745, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.1620081439614296, "step": 4293 }, { "epoch": 0.06411874062073034, "grad_norm": 0.435546875, "grad_norm_var": 0.0048143863677978516, "learning_rate": 0.0001, "loss": 1.8125, "loss/crossentropy": 2.4399315118789673, "loss/fcd": 1.57421875, "loss/idx": 9.0, "loss/logits": 0.23825812339782715, "step": 4294 }, { "epoch": 0.06413367279134531, "grad_norm": 0.384765625, "grad_norm_var": 0.004565668106079101, "learning_rate": 0.0001, "loss": 1.5963, "loss/crossentropy": 2.557780146598816, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.20962286740541458, "step": 4295 }, { "epoch": 0.0641486049619603, "grad_norm": 0.359375, "grad_norm_var": 0.0031450748443603515, "learning_rate": 0.0001, "loss": 1.4152, "loss/crossentropy": 2.5232813358306885, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.17305073142051697, "step": 4296 }, { "epoch": 0.06416353713257528, "grad_norm": 0.3515625, "grad_norm_var": 0.003307199478149414, "learning_rate": 0.0001, "loss": 1.3783, "loss/crossentropy": 2.5480986833572388, "loss/fcd": 1.21875, "loss/idx": 9.0, "loss/logits": 0.15957307070493698, "step": 4297 }, { "epoch": 0.06417846930319025, "grad_norm": 0.388671875, "grad_norm_var": 0.0033351739247639975, "learning_rate": 0.0001, "loss": 1.4764, "loss/crossentropy": 2.6509591341018677, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.1951727420091629, "step": 4298 }, { "epoch": 0.06419340147380524, "grad_norm": 0.3671875, "grad_norm_var": 0.0033004124959309894, "learning_rate": 0.0001, "loss": 1.5431, "loss/crossentropy": 2.5142061710357666, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.1837358996272087, "step": 4299 }, { "epoch": 0.06420833364442022, "grad_norm": 0.421875, "grad_norm_var": 0.0031388441721598307, "learning_rate": 0.0001, "loss": 1.5665, "loss/crossentropy": 2.6317306756973267, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.19145788252353668, "step": 4300 }, { "epoch": 0.06422326581503521, "grad_norm": 0.349609375, "grad_norm_var": 0.0011768976847330729, "learning_rate": 0.0001, "loss": 1.4818, "loss/crossentropy": 2.730645537376404, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18880850821733475, "step": 4301 }, { "epoch": 0.06423819798565018, "grad_norm": 0.52734375, "grad_norm_var": 0.002164189020792643, "learning_rate": 0.0001, "loss": 2.0073, "loss/crossentropy": 2.5344929695129395, "loss/fcd": 1.69921875, "loss/idx": 9.0, "loss/logits": 0.3080381155014038, "step": 4302 }, { "epoch": 0.06425313015626516, "grad_norm": 0.421875, "grad_norm_var": 0.00217588742574056, "learning_rate": 0.0001, "loss": 1.663, "loss/crossentropy": 2.5792784690856934, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.22945423424243927, "step": 4303 }, { "epoch": 0.06426806232688015, "grad_norm": 0.37109375, "grad_norm_var": 0.001996342341105143, "learning_rate": 0.0001, "loss": 1.4526, "loss/crossentropy": 2.8720319271087646, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.17523255944252014, "step": 4304 }, { "epoch": 0.06428299449749512, "grad_norm": 0.361328125, "grad_norm_var": 0.0020630995432535807, "learning_rate": 0.0001, "loss": 1.5144, "loss/crossentropy": 2.6480159759521484, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.20189521461725235, "step": 4305 }, { "epoch": 0.06429792666811011, "grad_norm": 0.408203125, "grad_norm_var": 0.0019925435384114585, "learning_rate": 0.0001, "loss": 1.6336, "loss/crossentropy": 2.8921741247177124, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.21950171142816544, "step": 4306 }, { "epoch": 0.06431285883872509, "grad_norm": 0.357421875, "grad_norm_var": 0.002013079325358073, "learning_rate": 0.0001, "loss": 1.5921, "loss/crossentropy": 2.4048264026641846, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.22099730372428894, "step": 4307 }, { "epoch": 0.06432779100934008, "grad_norm": 0.38671875, "grad_norm_var": 0.0020020643870035807, "learning_rate": 0.0001, "loss": 1.5317, "loss/crossentropy": 2.600233793258667, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.18403208255767822, "step": 4308 }, { "epoch": 0.06434272317995506, "grad_norm": 0.39453125, "grad_norm_var": 0.0020020643870035807, "learning_rate": 0.0001, "loss": 1.4936, "loss/crossentropy": 2.849720001220703, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.19283714890480042, "step": 4309 }, { "epoch": 0.06435765535057003, "grad_norm": 0.37109375, "grad_norm_var": 0.001895586649576823, "learning_rate": 0.0001, "loss": 1.6484, "loss/crossentropy": 2.4465357065200806, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.21087726205587387, "step": 4310 }, { "epoch": 0.06437258752118502, "grad_norm": 0.388671875, "grad_norm_var": 0.001894378662109375, "learning_rate": 0.0001, "loss": 1.4707, "loss/crossentropy": 2.8132940530776978, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.19331543147563934, "step": 4311 }, { "epoch": 0.0643875196918, "grad_norm": 0.375, "grad_norm_var": 0.0018475850423177083, "learning_rate": 0.0001, "loss": 1.5226, "loss/crossentropy": 2.485331177711487, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19052754342556, "step": 4312 }, { "epoch": 0.06440245186241499, "grad_norm": 0.36328125, "grad_norm_var": 0.0017958958943684897, "learning_rate": 0.0001, "loss": 1.57, "loss/crossentropy": 2.4853755235671997, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.21058544516563416, "step": 4313 }, { "epoch": 0.06441738403302996, "grad_norm": 0.361328125, "grad_norm_var": 0.0018506368001302083, "learning_rate": 0.0001, "loss": 1.488, "loss/crossentropy": 2.7370649576187134, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.19115576893091202, "step": 4314 }, { "epoch": 0.06443231620364494, "grad_norm": 0.390625, "grad_norm_var": 0.0018163045247395833, "learning_rate": 0.0001, "loss": 1.4151, "loss/crossentropy": 2.511844277381897, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.16123904287815094, "step": 4315 }, { "epoch": 0.06444724837425993, "grad_norm": 0.38671875, "grad_norm_var": 0.0017470677693684896, "learning_rate": 0.0001, "loss": 1.5753, "loss/crossentropy": 2.673948645591736, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.20025160163640976, "step": 4316 }, { "epoch": 0.0644621805448749, "grad_norm": 0.40625, "grad_norm_var": 0.0016544183095296224, "learning_rate": 0.0001, "loss": 1.7934, "loss/crossentropy": 2.5966535806655884, "loss/fcd": 1.53515625, "loss/idx": 9.0, "loss/logits": 0.2582811042666435, "step": 4317 }, { "epoch": 0.06447711271548989, "grad_norm": 0.455078125, "grad_norm_var": 0.0006764094034830729, "learning_rate": 0.0001, "loss": 1.715, "loss/crossentropy": 2.358456015586853, "loss/fcd": 1.515625, "loss/idx": 9.0, "loss/logits": 0.19933749735355377, "step": 4318 }, { "epoch": 0.06449204488610487, "grad_norm": 0.33203125, "grad_norm_var": 0.0007685343424479167, "learning_rate": 0.0001, "loss": 1.4423, "loss/crossentropy": 2.507981538772583, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.1688276305794716, "step": 4319 }, { "epoch": 0.06450697705671984, "grad_norm": 0.453125, "grad_norm_var": 0.001071612040201823, "learning_rate": 0.0001, "loss": 1.5568, "loss/crossentropy": 2.7670978307724, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.18965008109807968, "step": 4320 }, { "epoch": 0.06452190922733483, "grad_norm": 0.44921875, "grad_norm_var": 0.0012540022532145183, "learning_rate": 0.0001, "loss": 1.5242, "loss/crossentropy": 2.8438574075698853, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19606559723615646, "step": 4321 }, { "epoch": 0.06453684139794981, "grad_norm": 0.357421875, "grad_norm_var": 0.0013085524241129556, "learning_rate": 0.0001, "loss": 1.436, "loss/crossentropy": 2.585526466369629, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.17822428047657013, "step": 4322 }, { "epoch": 0.0645517735685648, "grad_norm": 0.34765625, "grad_norm_var": 0.0013559977213541666, "learning_rate": 0.0001, "loss": 1.5752, "loss/crossentropy": 2.5040897130966187, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20801691710948944, "step": 4323 }, { "epoch": 0.06456670573917977, "grad_norm": 0.345703125, "grad_norm_var": 0.0014718214670817057, "learning_rate": 0.0001, "loss": 1.4387, "loss/crossentropy": 2.5142083168029785, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.17700152099132538, "step": 4324 }, { "epoch": 0.06458163790979475, "grad_norm": 0.32421875, "grad_norm_var": 0.0017018477121988932, "learning_rate": 0.0001, "loss": 1.4124, "loss/crossentropy": 2.554168939590454, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.16239356249570847, "step": 4325 }, { "epoch": 0.06459657008040974, "grad_norm": 0.31640625, "grad_norm_var": 0.0019662062327067058, "learning_rate": 0.0001, "loss": 1.4471, "loss/crossentropy": 2.5660585165023804, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.1853431686758995, "step": 4326 }, { "epoch": 0.06461150225102472, "grad_norm": 0.5234375, "grad_norm_var": 0.0032877604166666667, "learning_rate": 0.0001, "loss": 1.6549, "loss/crossentropy": 2.2927587032318115, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.17447254806756973, "step": 4327 }, { "epoch": 0.0646264344216397, "grad_norm": 0.375, "grad_norm_var": 0.0032877604166666667, "learning_rate": 0.0001, "loss": 1.6164, "loss/crossentropy": 2.5465675592422485, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.22573503851890564, "step": 4328 }, { "epoch": 0.06464136659225468, "grad_norm": 0.375, "grad_norm_var": 0.003259722391764323, "learning_rate": 0.0001, "loss": 1.634, "loss/crossentropy": 2.5526533126831055, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.2316678911447525, "step": 4329 }, { "epoch": 0.06465629876286967, "grad_norm": 0.431640625, "grad_norm_var": 0.003323809305826823, "learning_rate": 0.0001, "loss": 1.6475, "loss/crossentropy": 2.5425580739974976, "loss/fcd": 1.453125, "loss/idx": 9.0, "loss/logits": 0.1943874955177307, "step": 4330 }, { "epoch": 0.06467123093348465, "grad_norm": 0.33984375, "grad_norm_var": 0.0034932454427083334, "learning_rate": 0.0001, "loss": 1.4144, "loss/crossentropy": 2.477900981903076, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.16045044362545013, "step": 4331 }, { "epoch": 0.06468616310409962, "grad_norm": 0.365234375, "grad_norm_var": 0.0035276889801025392, "learning_rate": 0.0001, "loss": 1.4704, "loss/crossentropy": 2.6880550384521484, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18136319518089294, "step": 4332 }, { "epoch": 0.06470109527471461, "grad_norm": 0.388671875, "grad_norm_var": 0.003502655029296875, "learning_rate": 0.0001, "loss": 1.646, "loss/crossentropy": 2.6845333576202393, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.21635441482067108, "step": 4333 }, { "epoch": 0.06471602744532959, "grad_norm": 0.328125, "grad_norm_var": 0.0033445835113525392, "learning_rate": 0.0001, "loss": 1.4812, "loss/crossentropy": 2.543837547302246, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18822187930345535, "step": 4334 }, { "epoch": 0.06473095961594458, "grad_norm": 0.37109375, "grad_norm_var": 0.0031989892323811848, "learning_rate": 0.0001, "loss": 1.4316, "loss/crossentropy": 2.728258728981018, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.17376457154750824, "step": 4335 }, { "epoch": 0.06474589178655955, "grad_norm": 0.3828125, "grad_norm_var": 0.0028293450673421224, "learning_rate": 0.0001, "loss": 1.4736, "loss/crossentropy": 2.5652732849121094, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18458616733551025, "step": 4336 }, { "epoch": 0.06476082395717453, "grad_norm": 0.345703125, "grad_norm_var": 0.0024932225545247397, "learning_rate": 0.0001, "loss": 1.4419, "loss/crossentropy": 2.6308146715164185, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17626478523015976, "step": 4337 }, { "epoch": 0.06477575612778952, "grad_norm": 0.396484375, "grad_norm_var": 0.0025237401326497397, "learning_rate": 0.0001, "loss": 1.5974, "loss/crossentropy": 2.5893982648849487, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.21070070564746857, "step": 4338 }, { "epoch": 0.0647906882984045, "grad_norm": 0.365234375, "grad_norm_var": 0.002485259373982747, "learning_rate": 0.0001, "loss": 1.4708, "loss/crossentropy": 2.7162232398986816, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.181736558675766, "step": 4339 }, { "epoch": 0.06480562046901948, "grad_norm": 0.3671875, "grad_norm_var": 0.0024347305297851562, "learning_rate": 0.0001, "loss": 1.5628, "loss/crossentropy": 2.62139356136322, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.19947678595781326, "step": 4340 }, { "epoch": 0.06482055263963446, "grad_norm": 0.34375, "grad_norm_var": 0.00232696533203125, "learning_rate": 0.0001, "loss": 1.5345, "loss/crossentropy": 2.748238444328308, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.2063996121287346, "step": 4341 }, { "epoch": 0.06483548481024944, "grad_norm": 0.345703125, "grad_norm_var": 0.0021479129791259766, "learning_rate": 0.0001, "loss": 1.2954, "loss/crossentropy": 2.6614253520965576, "loss/fcd": 1.14453125, "loss/idx": 9.0, "loss/logits": 0.15084750205278397, "step": 4342 }, { "epoch": 0.06485041698086443, "grad_norm": 0.328125, "grad_norm_var": 0.0007396539052327474, "learning_rate": 0.0001, "loss": 1.446, "loss/crossentropy": 2.549100399017334, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.17642395198345184, "step": 4343 }, { "epoch": 0.0648653491514794, "grad_norm": 0.373046875, "grad_norm_var": 0.0007374445597330729, "learning_rate": 0.0001, "loss": 1.3871, "loss/crossentropy": 2.650322675704956, "loss/fcd": 1.23046875, "loss/idx": 9.0, "loss/logits": 0.15665343403816223, "step": 4344 }, { "epoch": 0.06488028132209439, "grad_norm": 0.375, "grad_norm_var": 0.0007374445597330729, "learning_rate": 0.0001, "loss": 1.6266, "loss/crossentropy": 2.4588913917541504, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.22030248492956161, "step": 4345 }, { "epoch": 0.06489521349270937, "grad_norm": 0.349609375, "grad_norm_var": 0.00043436686197916664, "learning_rate": 0.0001, "loss": 1.4944, "loss/crossentropy": 2.685468077659607, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.18191833794116974, "step": 4346 }, { "epoch": 0.06491014566332434, "grad_norm": 0.361328125, "grad_norm_var": 0.00040446917215983074, "learning_rate": 0.0001, "loss": 1.5482, "loss/crossentropy": 2.334079623222351, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.2083956077694893, "step": 4347 }, { "epoch": 0.06492507783393933, "grad_norm": 0.470703125, "grad_norm_var": 0.0011494795481363932, "learning_rate": 0.0001, "loss": 2.0345, "loss/crossentropy": 2.2699865102767944, "loss/fcd": 1.765625, "loss/idx": 9.0, "loss/logits": 0.26884782314300537, "step": 4348 }, { "epoch": 0.06494001000455431, "grad_norm": 0.3203125, "grad_norm_var": 0.0012557347615559896, "learning_rate": 0.0001, "loss": 1.5556, "loss/crossentropy": 2.46175479888916, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.2001216560602188, "step": 4349 }, { "epoch": 0.0649549421751693, "grad_norm": 0.345703125, "grad_norm_var": 0.0011909325917561849, "learning_rate": 0.0001, "loss": 1.4644, "loss/crossentropy": 2.566065788269043, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.17921198159456253, "step": 4350 }, { "epoch": 0.06496987434578427, "grad_norm": 0.388671875, "grad_norm_var": 0.0012242635091145833, "learning_rate": 0.0001, "loss": 1.5328, "loss/crossentropy": 2.6115092039108276, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.196846604347229, "step": 4351 }, { "epoch": 0.06498480651639926, "grad_norm": 0.353515625, "grad_norm_var": 0.00121305783589681, "learning_rate": 0.0001, "loss": 1.4638, "loss/crossentropy": 2.6354514360427856, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.19032421708106995, "step": 4352 }, { "epoch": 0.06499973868701424, "grad_norm": 0.36328125, "grad_norm_var": 0.0011885960896809895, "learning_rate": 0.0001, "loss": 1.4598, "loss/crossentropy": 2.605360507965088, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.1863650307059288, "step": 4353 }, { "epoch": 0.06501467085762921, "grad_norm": 0.3671875, "grad_norm_var": 0.0011211236317952473, "learning_rate": 0.0001, "loss": 1.3519, "loss/crossentropy": 2.64324688911438, "loss/fcd": 1.19140625, "loss/idx": 9.0, "loss/logits": 0.16045493632555008, "step": 4354 }, { "epoch": 0.0650296030282442, "grad_norm": 0.451171875, "grad_norm_var": 0.0016008853912353516, "learning_rate": 0.0001, "loss": 1.5748, "loss/crossentropy": 2.455973505973816, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.1920250877737999, "step": 4355 }, { "epoch": 0.06504453519885918, "grad_norm": 0.40625, "grad_norm_var": 0.0016867160797119141, "learning_rate": 0.0001, "loss": 1.6096, "loss/crossentropy": 2.6553317308425903, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.2228955551981926, "step": 4356 }, { "epoch": 0.06505946736947417, "grad_norm": 0.328125, "grad_norm_var": 0.0017597039540608725, "learning_rate": 0.0001, "loss": 1.4825, "loss/crossentropy": 2.5336285829544067, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18565180897712708, "step": 4357 }, { "epoch": 0.06507439954008915, "grad_norm": 0.396484375, "grad_norm_var": 0.001753091812133789, "learning_rate": 0.0001, "loss": 1.5022, "loss/crossentropy": 2.4500216245651245, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.1818828508257866, "step": 4358 }, { "epoch": 0.06508933171070412, "grad_norm": 0.53125, "grad_norm_var": 0.0030986626942952475, "learning_rate": 0.0001, "loss": 1.6273, "loss/crossentropy": 2.6565433740615845, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.2171262949705124, "step": 4359 }, { "epoch": 0.06510426388131911, "grad_norm": 0.396484375, "grad_norm_var": 0.00309141476949056, "learning_rate": 0.0001, "loss": 1.8064, "loss/crossentropy": 2.5631723403930664, "loss/fcd": 1.56640625, "loss/idx": 9.0, "loss/logits": 0.24002249538898468, "step": 4360 }, { "epoch": 0.06511919605193409, "grad_norm": 0.33203125, "grad_norm_var": 0.0032802422841389975, "learning_rate": 0.0001, "loss": 1.425, "loss/crossentropy": 2.6031450033187866, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.17104676365852356, "step": 4361 }, { "epoch": 0.06513412822254908, "grad_norm": 0.337890625, "grad_norm_var": 0.0033443291982014974, "learning_rate": 0.0001, "loss": 1.4942, "loss/crossentropy": 2.560188412666321, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.1856372356414795, "step": 4362 }, { "epoch": 0.06514906039316405, "grad_norm": 0.376953125, "grad_norm_var": 0.0033115228017171223, "learning_rate": 0.0001, "loss": 1.4668, "loss/crossentropy": 2.894311547279358, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.1855199635028839, "step": 4363 }, { "epoch": 0.06516399256377903, "grad_norm": 0.400390625, "grad_norm_var": 0.002820571263631185, "learning_rate": 0.0001, "loss": 1.4803, "loss/crossentropy": 2.697547197341919, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.1756022945046425, "step": 4364 }, { "epoch": 0.06517892473439402, "grad_norm": 0.3984375, "grad_norm_var": 0.002570072809855143, "learning_rate": 0.0001, "loss": 1.4866, "loss/crossentropy": 2.6507577896118164, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18188905715942383, "step": 4365 }, { "epoch": 0.06519385690500899, "grad_norm": 0.373046875, "grad_norm_var": 0.0024703820546468098, "learning_rate": 0.0001, "loss": 1.4711, "loss/crossentropy": 2.7789559364318848, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.18201101571321487, "step": 4366 }, { "epoch": 0.06520878907562398, "grad_norm": 0.3515625, "grad_norm_var": 0.0025510152180989584, "learning_rate": 0.0001, "loss": 1.6697, "loss/crossentropy": 2.3687957525253296, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.2322419434785843, "step": 4367 }, { "epoch": 0.06522372124623896, "grad_norm": 0.375, "grad_norm_var": 0.0024889469146728515, "learning_rate": 0.0001, "loss": 1.4881, "loss/crossentropy": 2.5669243335723877, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18342113494873047, "step": 4368 }, { "epoch": 0.06523865341685395, "grad_norm": 0.35546875, "grad_norm_var": 0.0025170485178629557, "learning_rate": 0.0001, "loss": 1.6013, "loss/crossentropy": 2.5706944465637207, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.23408900201320648, "step": 4369 }, { "epoch": 0.06525358558746892, "grad_norm": 0.34765625, "grad_norm_var": 0.0025901635487874347, "learning_rate": 0.0001, "loss": 1.4533, "loss/crossentropy": 2.715335726737976, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.1915472373366356, "step": 4370 }, { "epoch": 0.0652685177580839, "grad_norm": 0.349609375, "grad_norm_var": 0.0023372491200764974, "learning_rate": 0.0001, "loss": 1.4902, "loss/crossentropy": 2.4736835956573486, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.18160487711429596, "step": 4371 }, { "epoch": 0.06528344992869889, "grad_norm": 0.44921875, "grad_norm_var": 0.002611398696899414, "learning_rate": 0.0001, "loss": 1.8946, "loss/crossentropy": 2.3323696851730347, "loss/fcd": 1.59765625, "loss/idx": 9.0, "loss/logits": 0.2969079166650772, "step": 4372 }, { "epoch": 0.06529838209931386, "grad_norm": 0.37109375, "grad_norm_var": 0.0024225711822509766, "learning_rate": 0.0001, "loss": 1.5489, "loss/crossentropy": 2.722555637359619, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.2012375444173813, "step": 4373 }, { "epoch": 0.06531331426992885, "grad_norm": 0.404296875, "grad_norm_var": 0.002439483006795247, "learning_rate": 0.0001, "loss": 1.6099, "loss/crossentropy": 2.6662269830703735, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.24659846723079681, "step": 4374 }, { "epoch": 0.06532824644054383, "grad_norm": 0.43359375, "grad_norm_var": 0.0011234124501546224, "learning_rate": 0.0001, "loss": 1.6371, "loss/crossentropy": 2.6631011962890625, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.21130336821079254, "step": 4375 }, { "epoch": 0.0653431786111588, "grad_norm": 0.388671875, "grad_norm_var": 0.0011082808176676431, "learning_rate": 0.0001, "loss": 1.4358, "loss/crossentropy": 2.57779598236084, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.1857595145702362, "step": 4376 }, { "epoch": 0.0653581107817738, "grad_norm": 0.396484375, "grad_norm_var": 0.0009745279947916667, "learning_rate": 0.0001, "loss": 1.5126, "loss/crossentropy": 2.8102879524230957, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.20405247807502747, "step": 4377 }, { "epoch": 0.06537304295238877, "grad_norm": 0.369140625, "grad_norm_var": 0.0008524576822916667, "learning_rate": 0.0001, "loss": 1.5404, "loss/crossentropy": 2.672439217567444, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.19661352038383484, "step": 4378 }, { "epoch": 0.06538797512300376, "grad_norm": 0.345703125, "grad_norm_var": 0.0009419759114583333, "learning_rate": 0.0001, "loss": 1.4522, "loss/crossentropy": 2.7121150493621826, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.1787414699792862, "step": 4379 }, { "epoch": 0.06540290729361874, "grad_norm": 0.45703125, "grad_norm_var": 0.0012826124827067058, "learning_rate": 0.0001, "loss": 1.6944, "loss/crossentropy": 2.5888746976852417, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.21389954537153244, "step": 4380 }, { "epoch": 0.06541783946423371, "grad_norm": 0.39453125, "grad_norm_var": 0.0012767632802327474, "learning_rate": 0.0001, "loss": 1.6773, "loss/crossentropy": 2.3963578939437866, "loss/fcd": 1.46875, "loss/idx": 9.0, "loss/logits": 0.20858927816152573, "step": 4381 }, { "epoch": 0.0654327716348487, "grad_norm": 0.333984375, "grad_norm_var": 0.0014350732167561849, "learning_rate": 0.0001, "loss": 1.4103, "loss/crossentropy": 2.565967559814453, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.16814109683036804, "step": 4382 }, { "epoch": 0.06544770380546368, "grad_norm": 0.361328125, "grad_norm_var": 0.0014005025227864583, "learning_rate": 0.0001, "loss": 1.627, "loss/crossentropy": 2.3446320295333862, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.23634546995162964, "step": 4383 }, { "epoch": 0.06546263597607867, "grad_norm": 0.349609375, "grad_norm_var": 0.0014688968658447266, "learning_rate": 0.0001, "loss": 1.5338, "loss/crossentropy": 2.6602017879486084, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.20567446947097778, "step": 4384 }, { "epoch": 0.06547756814669364, "grad_norm": 0.375, "grad_norm_var": 0.0014243920644124348, "learning_rate": 0.0001, "loss": 1.3953, "loss/crossentropy": 2.6043014526367188, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.1531090885400772, "step": 4385 }, { "epoch": 0.06549250031730862, "grad_norm": 0.341796875, "grad_norm_var": 0.0014540990193684896, "learning_rate": 0.0001, "loss": 1.4383, "loss/crossentropy": 2.727893829345703, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.1883363127708435, "step": 4386 }, { "epoch": 0.06550743248792361, "grad_norm": 0.38671875, "grad_norm_var": 0.001377089818318685, "learning_rate": 0.0001, "loss": 1.5469, "loss/crossentropy": 2.5697567462921143, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.20312948524951935, "step": 4387 }, { "epoch": 0.06552236465853858, "grad_norm": 0.40234375, "grad_norm_var": 0.00111234982808431, "learning_rate": 0.0001, "loss": 1.6616, "loss/crossentropy": 2.476949691772461, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.23581652343273163, "step": 4388 }, { "epoch": 0.06553729682915357, "grad_norm": 0.357421875, "grad_norm_var": 0.0011438369750976563, "learning_rate": 0.0001, "loss": 1.3658, "loss/crossentropy": 2.671204447746277, "loss/fcd": 1.21484375, "loss/idx": 9.0, "loss/logits": 0.15093894302845, "step": 4389 }, { "epoch": 0.06555222899976855, "grad_norm": 0.359375, "grad_norm_var": 0.0011310418446858724, "learning_rate": 0.0001, "loss": 1.5886, "loss/crossentropy": 2.737646460533142, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.22533488273620605, "step": 4390 }, { "epoch": 0.06556716117038354, "grad_norm": 0.42578125, "grad_norm_var": 0.0010772546132405598, "learning_rate": 0.0001, "loss": 1.5026, "loss/crossentropy": 2.4770233631134033, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.1901436448097229, "step": 4391 }, { "epoch": 0.06558209334099852, "grad_norm": 0.365234375, "grad_norm_var": 0.0010776360829671224, "learning_rate": 0.0001, "loss": 1.4872, "loss/crossentropy": 2.8000917434692383, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.1981610506772995, "step": 4392 }, { "epoch": 0.06559702551161349, "grad_norm": 0.353515625, "grad_norm_var": 0.0010776360829671224, "learning_rate": 0.0001, "loss": 1.5029, "loss/crossentropy": 2.6502649784088135, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.18651823699474335, "step": 4393 }, { "epoch": 0.06561195768222848, "grad_norm": 0.447265625, "grad_norm_var": 0.0014120578765869141, "learning_rate": 0.0001, "loss": 1.5382, "loss/crossentropy": 2.4787240028381348, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.1866631656885147, "step": 4394 }, { "epoch": 0.06562688985284346, "grad_norm": 0.62890625, "grad_norm_var": 0.00518487294514974, "learning_rate": 0.0001, "loss": 1.5174, "loss/crossentropy": 2.388199210166931, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19316307455301285, "step": 4395 }, { "epoch": 0.06564182202345845, "grad_norm": 0.349609375, "grad_norm_var": 0.00503538449605306, "learning_rate": 0.0001, "loss": 1.5145, "loss/crossentropy": 2.3023531436920166, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19031959027051926, "step": 4396 }, { "epoch": 0.06565675419407342, "grad_norm": 0.375, "grad_norm_var": 0.005046192804972331, "learning_rate": 0.0001, "loss": 1.3982, "loss/crossentropy": 2.7175716161727905, "loss/fcd": 1.23046875, "loss/idx": 9.0, "loss/logits": 0.16776369512081146, "step": 4397 }, { "epoch": 0.0656716863646884, "grad_norm": 0.408203125, "grad_norm_var": 0.004852914810180664, "learning_rate": 0.0001, "loss": 1.6006, "loss/crossentropy": 2.7201637029647827, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.23340583592653275, "step": 4398 }, { "epoch": 0.06568661853530339, "grad_norm": 0.369140625, "grad_norm_var": 0.004823795954386393, "learning_rate": 0.0001, "loss": 1.6223, "loss/crossentropy": 2.5646127462387085, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.20826680958271027, "step": 4399 }, { "epoch": 0.06570155070591836, "grad_norm": 0.38671875, "grad_norm_var": 0.004693031311035156, "learning_rate": 0.0001, "loss": 1.5453, "loss/crossentropy": 2.68288791179657, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.20155459642410278, "step": 4400 }, { "epoch": 0.06571648287653335, "grad_norm": 0.38671875, "grad_norm_var": 0.004669189453125, "learning_rate": 0.0001, "loss": 1.378, "loss/crossentropy": 2.5999462604522705, "loss/fcd": 1.21484375, "loss/idx": 9.0, "loss/logits": 0.16312573850154877, "step": 4401 }, { "epoch": 0.06573141504714833, "grad_norm": 0.47265625, "grad_norm_var": 0.00478526751200358, "learning_rate": 0.0001, "loss": 1.5022, "loss/crossentropy": 2.730336546897888, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.18578526377677917, "step": 4402 }, { "epoch": 0.0657463472177633, "grad_norm": 0.43359375, "grad_norm_var": 0.004810444513956706, "learning_rate": 0.0001, "loss": 1.6558, "loss/crossentropy": 2.4651718139648438, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.20661813020706177, "step": 4403 }, { "epoch": 0.0657612793883783, "grad_norm": 0.380859375, "grad_norm_var": 0.004854329427083333, "learning_rate": 0.0001, "loss": 1.4785, "loss/crossentropy": 2.740939974784851, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.17385851591825485, "step": 4404 }, { "epoch": 0.06577621155899327, "grad_norm": 0.423828125, "grad_norm_var": 0.004697608947753906, "learning_rate": 0.0001, "loss": 1.6652, "loss/crossentropy": 2.6036758422851562, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.23157040774822235, "step": 4405 }, { "epoch": 0.06579114372960826, "grad_norm": 0.40625, "grad_norm_var": 0.004516029357910156, "learning_rate": 0.0001, "loss": 1.5172, "loss/crossentropy": 2.442145347595215, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.189053975045681, "step": 4406 }, { "epoch": 0.06580607590022323, "grad_norm": 0.34765625, "grad_norm_var": 0.004767799377441406, "learning_rate": 0.0001, "loss": 1.4197, "loss/crossentropy": 2.5466482639312744, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.16579023748636246, "step": 4407 }, { "epoch": 0.06582100807083821, "grad_norm": 0.373046875, "grad_norm_var": 0.004726600646972656, "learning_rate": 0.0001, "loss": 1.5271, "loss/crossentropy": 2.446100115776062, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.1872980296611786, "step": 4408 }, { "epoch": 0.0658359402414532, "grad_norm": 0.369140625, "grad_norm_var": 0.0046264012654622395, "learning_rate": 0.0001, "loss": 1.4158, "loss/crossentropy": 2.756406307220459, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.18139158189296722, "step": 4409 }, { "epoch": 0.06585087241206818, "grad_norm": 0.359375, "grad_norm_var": 0.004671462376912435, "learning_rate": 0.0001, "loss": 1.5149, "loss/crossentropy": 2.5257071256637573, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19067278504371643, "step": 4410 }, { "epoch": 0.06586580458268317, "grad_norm": 0.353515625, "grad_norm_var": 0.0011685689290364583, "learning_rate": 0.0001, "loss": 1.4544, "loss/crossentropy": 2.7008934020996094, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.18875142186880112, "step": 4411 }, { "epoch": 0.06588073675329814, "grad_norm": 0.380859375, "grad_norm_var": 0.0010729471842447916, "learning_rate": 0.0001, "loss": 1.5057, "loss/crossentropy": 2.6629139184951782, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.1776050627231598, "step": 4412 }, { "epoch": 0.06589566892391313, "grad_norm": 0.404296875, "grad_norm_var": 0.0010712782541910807, "learning_rate": 0.0001, "loss": 1.6278, "loss/crossentropy": 2.481514573097229, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.21763035655021667, "step": 4413 }, { "epoch": 0.0659106010945281, "grad_norm": 0.37890625, "grad_norm_var": 0.0010576883951822916, "learning_rate": 0.0001, "loss": 1.5452, "loss/crossentropy": 2.629756450653076, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.20534837990999222, "step": 4414 }, { "epoch": 0.06592553326514308, "grad_norm": 0.3828125, "grad_norm_var": 0.001032876968383789, "learning_rate": 0.0001, "loss": 1.632, "loss/crossentropy": 2.7661519050598145, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.2375156357884407, "step": 4415 }, { "epoch": 0.06594046543575807, "grad_norm": 0.328125, "grad_norm_var": 0.001273202896118164, "learning_rate": 0.0001, "loss": 1.4721, "loss/crossentropy": 2.5534183979034424, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.1791011020541191, "step": 4416 }, { "epoch": 0.06595539760637305, "grad_norm": 0.31640625, "grad_norm_var": 0.0015787601470947266, "learning_rate": 0.0001, "loss": 1.4269, "loss/crossentropy": 2.6683578491210938, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.17694168537855148, "step": 4417 }, { "epoch": 0.06597032977698804, "grad_norm": 0.3984375, "grad_norm_var": 0.0010255018870035807, "learning_rate": 0.0001, "loss": 1.4487, "loss/crossentropy": 2.621416926383972, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17529309540987015, "step": 4418 }, { "epoch": 0.06598526194760301, "grad_norm": 0.33984375, "grad_norm_var": 0.0008713881174723308, "learning_rate": 0.0001, "loss": 1.4843, "loss/crossentropy": 2.5896023511886597, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.19519003480672836, "step": 4419 }, { "epoch": 0.06600019411821799, "grad_norm": 0.357421875, "grad_norm_var": 0.0008763472239176432, "learning_rate": 0.0001, "loss": 1.4955, "loss/crossentropy": 2.62551212310791, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.1946812942624092, "step": 4420 }, { "epoch": 0.06601512628883298, "grad_norm": 0.404296875, "grad_norm_var": 0.0007599989573160807, "learning_rate": 0.0001, "loss": 1.5735, "loss/crossentropy": 2.8717963695526123, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.21021585166454315, "step": 4421 }, { "epoch": 0.06603005845944795, "grad_norm": 0.62109375, "grad_norm_var": 0.004718383153279622, "learning_rate": 0.0001, "loss": 1.9264, "loss/crossentropy": 2.4998509883880615, "loss/fcd": 1.609375, "loss/idx": 9.0, "loss/logits": 0.3170690983533859, "step": 4422 }, { "epoch": 0.06604499063006294, "grad_norm": 0.4140625, "grad_norm_var": 0.004688119888305664, "learning_rate": 0.0001, "loss": 1.5296, "loss/crossentropy": 2.734713077545166, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.20537298172712326, "step": 4423 }, { "epoch": 0.06605992280067792, "grad_norm": 0.376953125, "grad_norm_var": 0.004682143529256185, "learning_rate": 0.0001, "loss": 1.4904, "loss/crossentropy": 2.41624915599823, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.18966469168663025, "step": 4424 }, { "epoch": 0.0660748549712929, "grad_norm": 0.33203125, "grad_norm_var": 0.004854583740234375, "learning_rate": 0.0001, "loss": 1.3523, "loss/crossentropy": 2.49601674079895, "loss/fcd": 1.19921875, "loss/idx": 9.0, "loss/logits": 0.1530403271317482, "step": 4425 }, { "epoch": 0.06608978714190789, "grad_norm": 0.35546875, "grad_norm_var": 0.004868507385253906, "learning_rate": 0.0001, "loss": 1.4894, "loss/crossentropy": 2.6390066146850586, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.17686710506677628, "step": 4426 }, { "epoch": 0.06610471931252286, "grad_norm": 0.3984375, "grad_norm_var": 0.004811843236287435, "learning_rate": 0.0001, "loss": 1.5575, "loss/crossentropy": 2.1941637992858887, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.19031628966331482, "step": 4427 }, { "epoch": 0.06611965148313785, "grad_norm": 0.349609375, "grad_norm_var": 0.004897801081339518, "learning_rate": 0.0001, "loss": 1.6673, "loss/crossentropy": 2.496118187904358, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.21812959760427475, "step": 4428 }, { "epoch": 0.06613458365375283, "grad_norm": 0.322265625, "grad_norm_var": 0.005106083552042643, "learning_rate": 0.0001, "loss": 1.3933, "loss/crossentropy": 2.4243955612182617, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.1588916778564453, "step": 4429 }, { "epoch": 0.06614951582436782, "grad_norm": 0.43359375, "grad_norm_var": 0.005286773045857747, "learning_rate": 0.0001, "loss": 1.6129, "loss/crossentropy": 2.457643985748291, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.21056394279003143, "step": 4430 }, { "epoch": 0.06616444799498279, "grad_norm": 0.361328125, "grad_norm_var": 0.005316670735677083, "learning_rate": 0.0001, "loss": 1.4247, "loss/crossentropy": 2.587219715118408, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.1629861369729042, "step": 4431 }, { "epoch": 0.06617938016559777, "grad_norm": 0.462890625, "grad_norm_var": 0.0054866631825764975, "learning_rate": 0.0001, "loss": 1.6085, "loss/crossentropy": 2.335988998413086, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.19447887688875198, "step": 4432 }, { "epoch": 0.06619431233621276, "grad_norm": 0.373046875, "grad_norm_var": 0.005129432678222657, "learning_rate": 0.0001, "loss": 1.5236, "loss/crossentropy": 2.4159224033355713, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.1837390512228012, "step": 4433 }, { "epoch": 0.06620924450682773, "grad_norm": 0.375, "grad_norm_var": 0.005149269104003906, "learning_rate": 0.0001, "loss": 1.5943, "loss/crossentropy": 2.4292402267456055, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.18804289400577545, "step": 4434 }, { "epoch": 0.06622417667744272, "grad_norm": 0.37890625, "grad_norm_var": 0.00497124989827474, "learning_rate": 0.0001, "loss": 1.5078, "loss/crossentropy": 2.6124294996261597, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.18748703598976135, "step": 4435 }, { "epoch": 0.0662391088480577, "grad_norm": 0.384765625, "grad_norm_var": 0.004881795247395833, "learning_rate": 0.0001, "loss": 1.7504, "loss/crossentropy": 2.0578110814094543, "loss/fcd": 1.53125, "loss/idx": 9.0, "loss/logits": 0.21914805471897125, "step": 4436 }, { "epoch": 0.06625404101867267, "grad_norm": 0.337890625, "grad_norm_var": 0.005088233947753906, "learning_rate": 0.0001, "loss": 1.4343, "loss/crossentropy": 2.683924078941345, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.18035294115543365, "step": 4437 }, { "epoch": 0.06626897318928766, "grad_norm": 0.4140625, "grad_norm_var": 0.0014523824055989584, "learning_rate": 0.0001, "loss": 1.5583, "loss/crossentropy": 2.807352662086487, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.18325890600681305, "step": 4438 }, { "epoch": 0.06628390535990264, "grad_norm": 0.365234375, "grad_norm_var": 0.0013756910959879557, "learning_rate": 0.0001, "loss": 1.5181, "loss/crossentropy": 2.6128127574920654, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19386626780033112, "step": 4439 }, { "epoch": 0.06629883753051763, "grad_norm": 0.41796875, "grad_norm_var": 0.0014841715494791667, "learning_rate": 0.0001, "loss": 1.6744, "loss/crossentropy": 2.7568249702453613, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.2095196694135666, "step": 4440 }, { "epoch": 0.0663137697011326, "grad_norm": 0.345703125, "grad_norm_var": 0.0014104048411051433, "learning_rate": 0.0001, "loss": 1.4631, "loss/crossentropy": 2.732303261756897, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18580403923988342, "step": 4441 }, { "epoch": 0.06632870187174758, "grad_norm": 0.412109375, "grad_norm_var": 0.001427459716796875, "learning_rate": 0.0001, "loss": 1.6003, "loss/crossentropy": 2.481852650642395, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.2018628939986229, "step": 4442 }, { "epoch": 0.06634363404236257, "grad_norm": 0.33984375, "grad_norm_var": 0.0015237808227539062, "learning_rate": 0.0001, "loss": 1.4844, "loss/crossentropy": 2.5725282430648804, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.1874791830778122, "step": 4443 }, { "epoch": 0.06635856621297755, "grad_norm": 0.41796875, "grad_norm_var": 0.0015421390533447265, "learning_rate": 0.0001, "loss": 1.6607, "loss/crossentropy": 2.697089433670044, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.2310624122619629, "step": 4444 }, { "epoch": 0.06637349838359254, "grad_norm": 0.396484375, "grad_norm_var": 0.001276381810506185, "learning_rate": 0.0001, "loss": 1.5051, "loss/crossentropy": 2.7094842195510864, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.20044294744729996, "step": 4445 }, { "epoch": 0.06638843055420751, "grad_norm": 0.388671875, "grad_norm_var": 0.0011327107747395833, "learning_rate": 0.0001, "loss": 1.375, "loss/crossentropy": 2.732586145401001, "loss/fcd": 1.2109375, "loss/idx": 9.0, "loss/logits": 0.1641109362244606, "step": 4446 }, { "epoch": 0.06640336272482249, "grad_norm": 0.400390625, "grad_norm_var": 0.001100921630859375, "learning_rate": 0.0001, "loss": 1.6017, "loss/crossentropy": 2.7620184421539307, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.1993975192308426, "step": 4447 }, { "epoch": 0.06641829489543748, "grad_norm": 0.365234375, "grad_norm_var": 0.0007242202758789062, "learning_rate": 0.0001, "loss": 1.5436, "loss/crossentropy": 2.689093589782715, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.18422069400548935, "step": 4448 }, { "epoch": 0.06643322706605245, "grad_norm": 0.423828125, "grad_norm_var": 0.0008242289225260417, "learning_rate": 0.0001, "loss": 1.7611, "loss/crossentropy": 2.4600390195846558, "loss/fcd": 1.4921875, "loss/idx": 9.0, "loss/logits": 0.26894421875476837, "step": 4449 }, { "epoch": 0.06644815923666744, "grad_norm": 0.37890625, "grad_norm_var": 0.0008198420206705729, "learning_rate": 0.0001, "loss": 1.3833, "loss/crossentropy": 2.403849244117737, "loss/fcd": 1.23828125, "loss/idx": 9.0, "loss/logits": 0.14503557980060577, "step": 4450 }, { "epoch": 0.06646309140728242, "grad_norm": 0.376953125, "grad_norm_var": 0.0008217970530192058, "learning_rate": 0.0001, "loss": 1.5276, "loss/crossentropy": 2.515079140663147, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19947415590286255, "step": 4451 }, { "epoch": 0.06647802357789741, "grad_norm": 0.34765625, "grad_norm_var": 0.0009108861287434896, "learning_rate": 0.0001, "loss": 1.4573, "loss/crossentropy": 2.6399112939834595, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.18383798003196716, "step": 4452 }, { "epoch": 0.06649295574851238, "grad_norm": 0.49609375, "grad_norm_var": 0.0015224297841389975, "learning_rate": 0.0001, "loss": 1.6468, "loss/crossentropy": 2.605020761489868, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.2249142974615097, "step": 4453 }, { "epoch": 0.06650788791912736, "grad_norm": 0.33984375, "grad_norm_var": 0.001657724380493164, "learning_rate": 0.0001, "loss": 1.4385, "loss/crossentropy": 2.4757111072540283, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17290576547384262, "step": 4454 }, { "epoch": 0.06652282008974235, "grad_norm": 0.3984375, "grad_norm_var": 0.0016244888305664063, "learning_rate": 0.0001, "loss": 1.5902, "loss/crossentropy": 2.7364929914474487, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.21915187686681747, "step": 4455 }, { "epoch": 0.06653775226035732, "grad_norm": 0.359375, "grad_norm_var": 0.00162353515625, "learning_rate": 0.0001, "loss": 1.4282, "loss/crossentropy": 2.710822343826294, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.17818381637334824, "step": 4456 }, { "epoch": 0.06655268443097231, "grad_norm": 0.369140625, "grad_norm_var": 0.001529693603515625, "learning_rate": 0.0001, "loss": 1.5075, "loss/crossentropy": 2.491738796234131, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.18331941962242126, "step": 4457 }, { "epoch": 0.06656761660158729, "grad_norm": 0.408203125, "grad_norm_var": 0.0015181859334309896, "learning_rate": 0.0001, "loss": 1.6907, "loss/crossentropy": 2.311042547225952, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.22587329149246216, "step": 4458 }, { "epoch": 0.06658254877220227, "grad_norm": 0.51171875, "grad_norm_var": 0.002262306213378906, "learning_rate": 0.0001, "loss": 1.4978, "loss/crossentropy": 2.5277539491653442, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.16578057408332825, "step": 4459 }, { "epoch": 0.06659748094281726, "grad_norm": 0.470703125, "grad_norm_var": 0.002571725845336914, "learning_rate": 0.0001, "loss": 1.647, "loss/crossentropy": 2.73630952835083, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.22126831114292145, "step": 4460 }, { "epoch": 0.06661241311343223, "grad_norm": 0.376953125, "grad_norm_var": 0.002609872817993164, "learning_rate": 0.0001, "loss": 1.6393, "loss/crossentropy": 2.6061935424804688, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.2212827503681183, "step": 4461 }, { "epoch": 0.06662734528404722, "grad_norm": 0.3515625, "grad_norm_var": 0.0027557373046875, "learning_rate": 0.0001, "loss": 1.3765, "loss/crossentropy": 2.661167621612549, "loss/fcd": 1.2109375, "loss/idx": 9.0, "loss/logits": 0.16557656228542328, "step": 4462 }, { "epoch": 0.0666422774546622, "grad_norm": 0.376953125, "grad_norm_var": 0.002783966064453125, "learning_rate": 0.0001, "loss": 1.4759, "loss/crossentropy": 2.417749047279358, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.1868002563714981, "step": 4463 }, { "epoch": 0.06665720962527717, "grad_norm": 0.40234375, "grad_norm_var": 0.00271299680074056, "learning_rate": 0.0001, "loss": 1.515, "loss/crossentropy": 2.5749553442001343, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18296603858470917, "step": 4464 }, { "epoch": 0.06667214179589216, "grad_norm": 0.349609375, "grad_norm_var": 0.002814467748006185, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.4319084882736206, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.19057178497314453, "step": 4465 }, { "epoch": 0.06668707396650714, "grad_norm": 0.34765625, "grad_norm_var": 0.0029411156972249347, "learning_rate": 0.0001, "loss": 1.4847, "loss/crossentropy": 2.5339869260787964, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18005642294883728, "step": 4466 }, { "epoch": 0.06670200613712213, "grad_norm": 0.400390625, "grad_norm_var": 0.0029262383778889973, "learning_rate": 0.0001, "loss": 1.5065, "loss/crossentropy": 2.7944620847702026, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1900995373725891, "step": 4467 }, { "epoch": 0.0667169383077371, "grad_norm": 0.47265625, "grad_norm_var": 0.0031276543935139975, "learning_rate": 0.0001, "loss": 1.6093, "loss/crossentropy": 2.728528618812561, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.2148180603981018, "step": 4468 }, { "epoch": 0.06673187047835208, "grad_norm": 0.337890625, "grad_norm_var": 0.0027066548665364582, "learning_rate": 0.0001, "loss": 1.421, "loss/crossentropy": 2.3999887704849243, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.1631414219737053, "step": 4469 }, { "epoch": 0.06674680264896707, "grad_norm": 0.37890625, "grad_norm_var": 0.0025299072265625, "learning_rate": 0.0001, "loss": 1.74, "loss/crossentropy": 2.496440052986145, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.2790282517671585, "step": 4470 }, { "epoch": 0.06676173481958204, "grad_norm": 0.470703125, "grad_norm_var": 0.002893940607706706, "learning_rate": 0.0001, "loss": 1.628, "loss/crossentropy": 2.4856079816818237, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.20219217240810394, "step": 4471 }, { "epoch": 0.06677666699019703, "grad_norm": 0.490234375, "grad_norm_var": 0.0032719930013020834, "learning_rate": 0.0001, "loss": 1.6857, "loss/crossentropy": 2.7277190685272217, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.2598995044827461, "step": 4472 }, { "epoch": 0.06679159916081201, "grad_norm": 0.36328125, "grad_norm_var": 0.0033038934071858723, "learning_rate": 0.0001, "loss": 1.5277, "loss/crossentropy": 2.5999443531036377, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.19178827106952667, "step": 4473 }, { "epoch": 0.066806531331427, "grad_norm": 0.375, "grad_norm_var": 0.003366851806640625, "learning_rate": 0.0001, "loss": 1.4707, "loss/crossentropy": 2.591488480567932, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18553613126277924, "step": 4474 }, { "epoch": 0.06682146350204198, "grad_norm": 0.494140625, "grad_norm_var": 0.003135538101196289, "learning_rate": 0.0001, "loss": 1.4213, "loss/crossentropy": 2.5278271436691284, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.16351444274187088, "step": 4475 }, { "epoch": 0.06683639567265695, "grad_norm": 0.38671875, "grad_norm_var": 0.002825927734375, "learning_rate": 0.0001, "loss": 1.4502, "loss/crossentropy": 2.505910873413086, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.16890953481197357, "step": 4476 }, { "epoch": 0.06685132784327194, "grad_norm": 0.353515625, "grad_norm_var": 0.002927398681640625, "learning_rate": 0.0001, "loss": 1.4661, "loss/crossentropy": 2.540330410003662, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18095871806144714, "step": 4477 }, { "epoch": 0.06686626001388692, "grad_norm": 0.41796875, "grad_norm_var": 0.0028009414672851562, "learning_rate": 0.0001, "loss": 1.8233, "loss/crossentropy": 2.569303870201111, "loss/fcd": 1.5859375, "loss/idx": 9.0, "loss/logits": 0.2373943030834198, "step": 4478 }, { "epoch": 0.0668811921845019, "grad_norm": 0.37109375, "grad_norm_var": 0.0028219699859619142, "learning_rate": 0.0001, "loss": 1.4554, "loss/crossentropy": 2.6518474817276, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.17808758467435837, "step": 4479 }, { "epoch": 0.06689612435511688, "grad_norm": 0.37109375, "grad_norm_var": 0.002876393000284831, "learning_rate": 0.0001, "loss": 1.419, "loss/crossentropy": 2.689728021621704, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.16900113970041275, "step": 4480 }, { "epoch": 0.06691105652573186, "grad_norm": 0.443359375, "grad_norm_var": 0.0028107802073160807, "learning_rate": 0.0001, "loss": 1.5692, "loss/crossentropy": 2.7594648599624634, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20197799801826477, "step": 4481 }, { "epoch": 0.06692598869634685, "grad_norm": 0.375, "grad_norm_var": 0.0026496728261311848, "learning_rate": 0.0001, "loss": 1.6184, "loss/crossentropy": 2.5189491510391235, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.2277872934937477, "step": 4482 }, { "epoch": 0.06694092086696182, "grad_norm": 0.498046875, "grad_norm_var": 0.0031678358713785808, "learning_rate": 0.0001, "loss": 1.779, "loss/crossentropy": 2.953598141670227, "loss/fcd": 1.5078125, "loss/idx": 9.0, "loss/logits": 0.27117667347192764, "step": 4483 }, { "epoch": 0.06695585303757681, "grad_norm": 0.375, "grad_norm_var": 0.0029802799224853517, "learning_rate": 0.0001, "loss": 1.5948, "loss/crossentropy": 2.621534824371338, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.21587764471769333, "step": 4484 }, { "epoch": 0.06697078520819179, "grad_norm": 0.404296875, "grad_norm_var": 0.002649545669555664, "learning_rate": 0.0001, "loss": 1.5531, "loss/crossentropy": 2.591889500617981, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.1937136873602867, "step": 4485 }, { "epoch": 0.06698571737880676, "grad_norm": 0.328125, "grad_norm_var": 0.0030247847239176433, "learning_rate": 0.0001, "loss": 1.411, "loss/crossentropy": 2.8237913846969604, "loss/fcd": 1.23828125, "loss/idx": 9.0, "loss/logits": 0.17267194390296936, "step": 4486 }, { "epoch": 0.06700064954942175, "grad_norm": 0.3515625, "grad_norm_var": 0.0029055277506510415, "learning_rate": 0.0001, "loss": 1.5093, "loss/crossentropy": 2.710355520248413, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.18896190077066422, "step": 4487 }, { "epoch": 0.06701558172003673, "grad_norm": 0.390625, "grad_norm_var": 0.002325932184855143, "learning_rate": 0.0001, "loss": 1.4662, "loss/crossentropy": 2.510976195335388, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.16930440068244934, "step": 4488 }, { "epoch": 0.06703051389065172, "grad_norm": 0.4140625, "grad_norm_var": 0.002281300226847331, "learning_rate": 0.0001, "loss": 1.7078, "loss/crossentropy": 2.621753454208374, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.24293141812086105, "step": 4489 }, { "epoch": 0.0670454460612667, "grad_norm": 0.3671875, "grad_norm_var": 0.0023078759511311847, "learning_rate": 0.0001, "loss": 1.5687, "loss/crossentropy": 2.6313599348068237, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.21716447919607162, "step": 4490 }, { "epoch": 0.06706037823188168, "grad_norm": 0.42578125, "grad_norm_var": 0.0017087300618489583, "learning_rate": 0.0001, "loss": 1.5347, "loss/crossentropy": 2.860509514808655, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.19095420837402344, "step": 4491 }, { "epoch": 0.06707531040249666, "grad_norm": 0.36328125, "grad_norm_var": 0.0017598470052083333, "learning_rate": 0.0001, "loss": 1.5106, "loss/crossentropy": 2.468935489654541, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.17470353841781616, "step": 4492 }, { "epoch": 0.06709024257311164, "grad_norm": 0.35546875, "grad_norm_var": 0.0017504215240478516, "learning_rate": 0.0001, "loss": 1.5458, "loss/crossentropy": 2.549775004386902, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.19032415002584457, "step": 4493 }, { "epoch": 0.06710517474372663, "grad_norm": 0.447265625, "grad_norm_var": 0.001910400390625, "learning_rate": 0.0001, "loss": 1.9371, "loss/crossentropy": 2.457066535949707, "loss/fcd": 1.6640625, "loss/idx": 9.0, "loss/logits": 0.2730548083782196, "step": 4494 }, { "epoch": 0.0671201069143416, "grad_norm": 0.400390625, "grad_norm_var": 0.0018801212310791016, "learning_rate": 0.0001, "loss": 1.5829, "loss/crossentropy": 2.3641602993011475, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.192282572388649, "step": 4495 }, { "epoch": 0.06713503908495659, "grad_norm": 0.40625, "grad_norm_var": 0.0018480777740478515, "learning_rate": 0.0001, "loss": 1.6898, "loss/crossentropy": 2.342658042907715, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.24055421352386475, "step": 4496 }, { "epoch": 0.06714997125557157, "grad_norm": 0.4375, "grad_norm_var": 0.0018136978149414062, "learning_rate": 0.0001, "loss": 1.5069, "loss/crossentropy": 2.6618106365203857, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1905006319284439, "step": 4497 }, { "epoch": 0.06716490342618654, "grad_norm": 0.3828125, "grad_norm_var": 0.0017953872680664062, "learning_rate": 0.0001, "loss": 1.714, "loss/crossentropy": 2.2305068969726562, "loss/fcd": 1.51171875, "loss/idx": 9.0, "loss/logits": 0.20229707658290863, "step": 4498 }, { "epoch": 0.06717983559680153, "grad_norm": 0.361328125, "grad_norm_var": 0.0011166890462239584, "learning_rate": 0.0001, "loss": 1.5694, "loss/crossentropy": 2.6223180294036865, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.2021799087524414, "step": 4499 }, { "epoch": 0.06719476776741651, "grad_norm": 0.3984375, "grad_norm_var": 0.0011098225911458333, "learning_rate": 0.0001, "loss": 1.5554, "loss/crossentropy": 2.773563504219055, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.18817325681447983, "step": 4500 }, { "epoch": 0.0672096999380315, "grad_norm": 0.341796875, "grad_norm_var": 0.0012318929036458333, "learning_rate": 0.0001, "loss": 1.4586, "loss/crossentropy": 2.5603229999542236, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18129803240299225, "step": 4501 }, { "epoch": 0.06722463210864647, "grad_norm": 0.341796875, "grad_norm_var": 0.0011385440826416015, "learning_rate": 0.0001, "loss": 1.5283, "loss/crossentropy": 2.501080274581909, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19622090458869934, "step": 4502 }, { "epoch": 0.06723956427926145, "grad_norm": 0.365234375, "grad_norm_var": 0.0010863622029622397, "learning_rate": 0.0001, "loss": 1.508, "loss/crossentropy": 2.598511219024658, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.2032720446586609, "step": 4503 }, { "epoch": 0.06725449644987644, "grad_norm": 0.44921875, "grad_norm_var": 0.0013257344563802084, "learning_rate": 0.0001, "loss": 1.5568, "loss/crossentropy": 2.5575881004333496, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.19353000819683075, "step": 4504 }, { "epoch": 0.06726942862049141, "grad_norm": 0.41015625, "grad_norm_var": 0.0013147354125976562, "learning_rate": 0.0001, "loss": 1.6454, "loss/crossentropy": 2.473353385925293, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.204009048640728, "step": 4505 }, { "epoch": 0.0672843607911064, "grad_norm": 0.392578125, "grad_norm_var": 0.0012748559315999349, "learning_rate": 0.0001, "loss": 1.6792, "loss/crossentropy": 2.3726000785827637, "loss/fcd": 1.45703125, "loss/idx": 9.0, "loss/logits": 0.22220782190561295, "step": 4506 }, { "epoch": 0.06729929296172138, "grad_norm": 0.392578125, "grad_norm_var": 0.0011962254842122395, "learning_rate": 0.0001, "loss": 1.5557, "loss/crossentropy": 2.635146141052246, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19634570181369781, "step": 4507 }, { "epoch": 0.06731422513233636, "grad_norm": 0.373046875, "grad_norm_var": 0.0011668999989827473, "learning_rate": 0.0001, "loss": 1.5895, "loss/crossentropy": 2.6299010515213013, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.2262435108423233, "step": 4508 }, { "epoch": 0.06732915730295135, "grad_norm": 0.66796875, "grad_norm_var": 0.005790313084920247, "learning_rate": 0.0001, "loss": 1.9355, "loss/crossentropy": 2.638217568397522, "loss/fcd": 1.6484375, "loss/idx": 9.0, "loss/logits": 0.2871088683605194, "step": 4509 }, { "epoch": 0.06734408947356632, "grad_norm": 0.3984375, "grad_norm_var": 0.005700111389160156, "learning_rate": 0.0001, "loss": 1.4725, "loss/crossentropy": 2.4687405824661255, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.19128963351249695, "step": 4510 }, { "epoch": 0.06735902164418131, "grad_norm": 0.357421875, "grad_norm_var": 0.005856068929036459, "learning_rate": 0.0001, "loss": 1.4203, "loss/crossentropy": 2.586880326271057, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.17027059197425842, "step": 4511 }, { "epoch": 0.06737395381479629, "grad_norm": 0.37109375, "grad_norm_var": 0.005926450093587239, "learning_rate": 0.0001, "loss": 1.4558, "loss/crossentropy": 2.512851595878601, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.18239137530326843, "step": 4512 }, { "epoch": 0.06738888598541128, "grad_norm": 0.408203125, "grad_norm_var": 0.005843718846638997, "learning_rate": 0.0001, "loss": 1.6619, "loss/crossentropy": 2.3213582038879395, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.2283364087343216, "step": 4513 }, { "epoch": 0.06740381815602625, "grad_norm": 0.45703125, "grad_norm_var": 0.00601042111714681, "learning_rate": 0.0001, "loss": 1.7536, "loss/crossentropy": 2.8403165340423584, "loss/fcd": 1.5, "loss/idx": 9.0, "loss/logits": 0.25356607139110565, "step": 4514 }, { "epoch": 0.06741875032664123, "grad_norm": 0.423828125, "grad_norm_var": 0.005887333552042643, "learning_rate": 0.0001, "loss": 1.5507, "loss/crossentropy": 2.5676146745681763, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.1913641095161438, "step": 4515 }, { "epoch": 0.06743368249725622, "grad_norm": 0.33984375, "grad_norm_var": 0.006186787287394206, "learning_rate": 0.0001, "loss": 1.5151, "loss/crossentropy": 2.440189480781555, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19088593125343323, "step": 4516 }, { "epoch": 0.06744861466787119, "grad_norm": 0.421875, "grad_norm_var": 0.005905914306640625, "learning_rate": 0.0001, "loss": 1.599, "loss/crossentropy": 2.338878870010376, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.20442818105220795, "step": 4517 }, { "epoch": 0.06746354683848618, "grad_norm": 0.404296875, "grad_norm_var": 0.005576324462890625, "learning_rate": 0.0001, "loss": 1.5438, "loss/crossentropy": 2.5799248218536377, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.18834006041288376, "step": 4518 }, { "epoch": 0.06747847900910116, "grad_norm": 0.41796875, "grad_norm_var": 0.005403375625610352, "learning_rate": 0.0001, "loss": 1.6179, "loss/crossentropy": 2.8700655698776245, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.2194286286830902, "step": 4519 }, { "epoch": 0.06749341117971613, "grad_norm": 0.388671875, "grad_norm_var": 0.005379231770833334, "learning_rate": 0.0001, "loss": 1.6574, "loss/crossentropy": 2.4818440675735474, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.23161377012729645, "step": 4520 }, { "epoch": 0.06750834335033112, "grad_norm": 0.3671875, "grad_norm_var": 0.005517005920410156, "learning_rate": 0.0001, "loss": 1.5756, "loss/crossentropy": 2.470817804336548, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.20843684673309326, "step": 4521 }, { "epoch": 0.0675232755209461, "grad_norm": 0.34765625, "grad_norm_var": 0.005755726496378581, "learning_rate": 0.0001, "loss": 1.4403, "loss/crossentropy": 2.5768944025039673, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.18246019631624222, "step": 4522 }, { "epoch": 0.06753820769156109, "grad_norm": 0.416015625, "grad_norm_var": 0.0057400862375895185, "learning_rate": 0.0001, "loss": 1.6633, "loss/crossentropy": 2.3584569692611694, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.2140454202890396, "step": 4523 }, { "epoch": 0.06755313986217606, "grad_norm": 0.365234375, "grad_norm_var": 0.005782429377237956, "learning_rate": 0.0001, "loss": 1.4068, "loss/crossentropy": 2.844480514526367, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.16459456831216812, "step": 4524 }, { "epoch": 0.06756807203279104, "grad_norm": 0.326171875, "grad_norm_var": 0.001306915283203125, "learning_rate": 0.0001, "loss": 1.4816, "loss/crossentropy": 2.517199397087097, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18469516187906265, "step": 4525 }, { "epoch": 0.06758300420340603, "grad_norm": 0.36328125, "grad_norm_var": 0.0013360977172851562, "learning_rate": 0.0001, "loss": 1.553, "loss/crossentropy": 2.463915467262268, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.18577852845191956, "step": 4526 }, { "epoch": 0.067597936374021, "grad_norm": 0.369140625, "grad_norm_var": 0.001300048828125, "learning_rate": 0.0001, "loss": 1.4417, "loss/crossentropy": 2.7258172035217285, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.18383952230215073, "step": 4527 }, { "epoch": 0.067612868544636, "grad_norm": 0.37109375, "grad_norm_var": 0.001300048828125, "learning_rate": 0.0001, "loss": 1.4745, "loss/crossentropy": 2.5182923078536987, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.1854826658964157, "step": 4528 }, { "epoch": 0.06762780071525097, "grad_norm": 0.455078125, "grad_norm_var": 0.0015716552734375, "learning_rate": 0.0001, "loss": 1.4897, "loss/crossentropy": 2.682167649269104, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.18894869089126587, "step": 4529 }, { "epoch": 0.06764273288586595, "grad_norm": 0.3515625, "grad_norm_var": 0.0013193130493164063, "learning_rate": 0.0001, "loss": 1.4424, "loss/crossentropy": 2.71930730342865, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17674120515584946, "step": 4530 }, { "epoch": 0.06765766505648094, "grad_norm": 0.337890625, "grad_norm_var": 0.0013137181599934896, "learning_rate": 0.0001, "loss": 1.5425, "loss/crossentropy": 2.5693517923355103, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.20657986402511597, "step": 4531 }, { "epoch": 0.06767259722709591, "grad_norm": 0.36328125, "grad_norm_var": 0.0012297948201497396, "learning_rate": 0.0001, "loss": 1.4224, "loss/crossentropy": 2.7721364498138428, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.16849443316459656, "step": 4532 }, { "epoch": 0.0676875293977109, "grad_norm": 0.41796875, "grad_norm_var": 0.00120849609375, "learning_rate": 0.0001, "loss": 1.5709, "loss/crossentropy": 2.854246735572815, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.1998082771897316, "step": 4533 }, { "epoch": 0.06770246156832588, "grad_norm": 0.36328125, "grad_norm_var": 0.001174783706665039, "learning_rate": 0.0001, "loss": 1.5781, "loss/crossentropy": 2.5616018772125244, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.21486225724220276, "step": 4534 }, { "epoch": 0.06771739373894087, "grad_norm": 0.404296875, "grad_norm_var": 0.0011105855305989583, "learning_rate": 0.0001, "loss": 1.5671, "loss/crossentropy": 2.4521700143814087, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.1960131898522377, "step": 4535 }, { "epoch": 0.06773232590955584, "grad_norm": 0.349609375, "grad_norm_var": 0.0011372884114583333, "learning_rate": 0.0001, "loss": 1.4532, "loss/crossentropy": 2.5953547954559326, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.1837102547287941, "step": 4536 }, { "epoch": 0.06774725808017082, "grad_norm": 0.41796875, "grad_norm_var": 0.0012587865193684896, "learning_rate": 0.0001, "loss": 1.6463, "loss/crossentropy": 2.8545799255371094, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.23614482581615448, "step": 4537 }, { "epoch": 0.06776219025078581, "grad_norm": 0.34375, "grad_norm_var": 0.0012746175130208333, "learning_rate": 0.0001, "loss": 1.3642, "loss/crossentropy": 2.5389946699142456, "loss/fcd": 1.20703125, "loss/idx": 9.0, "loss/logits": 0.15720468014478683, "step": 4538 }, { "epoch": 0.06777712242140078, "grad_norm": 0.369140625, "grad_norm_var": 0.0011617024739583334, "learning_rate": 0.0001, "loss": 1.5614, "loss/crossentropy": 2.5452351570129395, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20206767320632935, "step": 4539 }, { "epoch": 0.06779205459201577, "grad_norm": 0.357421875, "grad_norm_var": 0.0011736551920572917, "learning_rate": 0.0001, "loss": 1.5526, "loss/crossentropy": 2.6413652896881104, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19318418204784393, "step": 4540 }, { "epoch": 0.06780698676263075, "grad_norm": 0.40234375, "grad_norm_var": 0.0010651747385660808, "learning_rate": 0.0001, "loss": 1.5416, "loss/crossentropy": 2.873280644416809, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.19786924868822098, "step": 4541 }, { "epoch": 0.06782191893324573, "grad_norm": 0.330078125, "grad_norm_var": 0.0011962254842122395, "learning_rate": 0.0001, "loss": 1.4038, "loss/crossentropy": 2.6572659015655518, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.16162671148777008, "step": 4542 }, { "epoch": 0.06783685110386072, "grad_norm": 0.3828125, "grad_norm_var": 0.0011967817942301433, "learning_rate": 0.0001, "loss": 1.5465, "loss/crossentropy": 2.437599301338196, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.17926901578903198, "step": 4543 }, { "epoch": 0.06785178327447569, "grad_norm": 0.33203125, "grad_norm_var": 0.001318216323852539, "learning_rate": 0.0001, "loss": 1.4166, "loss/crossentropy": 2.4329946041107178, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.1705121472477913, "step": 4544 }, { "epoch": 0.06786671544509068, "grad_norm": 0.41796875, "grad_norm_var": 0.0010014216105143229, "learning_rate": 0.0001, "loss": 1.5692, "loss/crossentropy": 2.7131705284118652, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.2019807994365692, "step": 4545 }, { "epoch": 0.06788164761570566, "grad_norm": 0.369140625, "grad_norm_var": 0.0009743849436442058, "learning_rate": 0.0001, "loss": 1.575, "loss/crossentropy": 2.651752471923828, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.1921444982290268, "step": 4546 }, { "epoch": 0.06789657978632063, "grad_norm": 0.359375, "grad_norm_var": 0.0009042739868164063, "learning_rate": 0.0001, "loss": 1.5622, "loss/crossentropy": 2.37518572807312, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.19114705175161362, "step": 4547 }, { "epoch": 0.06791151195693562, "grad_norm": 0.326171875, "grad_norm_var": 0.0010422865549723308, "learning_rate": 0.0001, "loss": 1.4479, "loss/crossentropy": 2.631251096725464, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.17834049463272095, "step": 4548 }, { "epoch": 0.0679264441275506, "grad_norm": 0.41796875, "grad_norm_var": 0.0010422865549723308, "learning_rate": 0.0001, "loss": 1.7073, "loss/crossentropy": 2.3306314945220947, "loss/fcd": 1.4921875, "loss/idx": 9.0, "loss/logits": 0.21514896303415298, "step": 4549 }, { "epoch": 0.06794137629816559, "grad_norm": 0.380859375, "grad_norm_var": 0.0010424296061197916, "learning_rate": 0.0001, "loss": 1.5283, "loss/crossentropy": 2.515547037124634, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.20020316541194916, "step": 4550 }, { "epoch": 0.06795630846878056, "grad_norm": 0.357421875, "grad_norm_var": 0.0009813944498697916, "learning_rate": 0.0001, "loss": 1.6122, "loss/crossentropy": 2.549039602279663, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.2215740829706192, "step": 4551 }, { "epoch": 0.06797124063939555, "grad_norm": 0.373046875, "grad_norm_var": 0.0009531656901041667, "learning_rate": 0.0001, "loss": 1.4413, "loss/crossentropy": 2.746061086654663, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.1756558045744896, "step": 4552 }, { "epoch": 0.06798617281001053, "grad_norm": 0.373046875, "grad_norm_var": 0.0007985273996988933, "learning_rate": 0.0001, "loss": 1.5902, "loss/crossentropy": 2.4302440881729126, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.2151578888297081, "step": 4553 }, { "epoch": 0.0680011049806255, "grad_norm": 0.333984375, "grad_norm_var": 0.0008364359537760417, "learning_rate": 0.0001, "loss": 1.4231, "loss/crossentropy": 2.5865461826324463, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.18090391159057617, "step": 4554 }, { "epoch": 0.0680160371512405, "grad_norm": 0.376953125, "grad_norm_var": 0.0008417765299479166, "learning_rate": 0.0001, "loss": 1.4413, "loss/crossentropy": 2.7521517276763916, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.18734703212976456, "step": 4555 }, { "epoch": 0.06803096932185547, "grad_norm": 0.40625, "grad_norm_var": 0.0009208520253499349, "learning_rate": 0.0001, "loss": 1.6183, "loss/crossentropy": 2.414618492126465, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.20033299177885056, "step": 4556 }, { "epoch": 0.06804590149247046, "grad_norm": 0.33984375, "grad_norm_var": 0.0009055932362874349, "learning_rate": 0.0001, "loss": 1.4724, "loss/crossentropy": 2.568824052810669, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.18722276389598846, "step": 4557 }, { "epoch": 0.06806083366308543, "grad_norm": 0.375, "grad_norm_var": 0.0008087158203125, "learning_rate": 0.0001, "loss": 1.6685, "loss/crossentropy": 2.3869731426239014, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.22705822438001633, "step": 4558 }, { "epoch": 0.06807576583370041, "grad_norm": 0.333984375, "grad_norm_var": 0.0008750756581624349, "learning_rate": 0.0001, "loss": 1.4247, "loss/crossentropy": 2.533425211906433, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.17864182591438293, "step": 4559 }, { "epoch": 0.0680906980043154, "grad_norm": 0.3671875, "grad_norm_var": 0.0007881005605061849, "learning_rate": 0.0001, "loss": 1.5032, "loss/crossentropy": 2.7064393758773804, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.19848761707544327, "step": 4560 }, { "epoch": 0.06810563017493038, "grad_norm": 0.38671875, "grad_norm_var": 0.0006461938222249349, "learning_rate": 0.0001, "loss": 1.5855, "loss/crossentropy": 2.858240246772766, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.22222764790058136, "step": 4561 }, { "epoch": 0.06812056234554537, "grad_norm": 0.392578125, "grad_norm_var": 0.0006862481435139974, "learning_rate": 0.0001, "loss": 1.6671, "loss/crossentropy": 2.7093539237976074, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.23349890112876892, "step": 4562 }, { "epoch": 0.06813549451616034, "grad_norm": 0.337890625, "grad_norm_var": 0.000742022196451823, "learning_rate": 0.0001, "loss": 1.397, "loss/crossentropy": 2.831666946411133, "loss/fcd": 1.23046875, "loss/idx": 9.0, "loss/logits": 0.1665544956922531, "step": 4563 }, { "epoch": 0.06815042668677532, "grad_norm": 0.3515625, "grad_norm_var": 0.0006426334381103516, "learning_rate": 0.0001, "loss": 1.4663, "loss/crossentropy": 2.7303733825683594, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18891702592372894, "step": 4564 }, { "epoch": 0.0681653588573903, "grad_norm": 0.31640625, "grad_norm_var": 0.0006244500478108724, "learning_rate": 0.0001, "loss": 1.4266, "loss/crossentropy": 2.503404378890991, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.16488832235336304, "step": 4565 }, { "epoch": 0.06818029102800528, "grad_norm": 0.33984375, "grad_norm_var": 0.0006301244099934896, "learning_rate": 0.0001, "loss": 1.4272, "loss/crossentropy": 2.6416099071502686, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.16940448433160782, "step": 4566 }, { "epoch": 0.06819522319862027, "grad_norm": 0.31640625, "grad_norm_var": 0.0007499535878499349, "learning_rate": 0.0001, "loss": 1.4755, "loss/crossentropy": 2.5484213829040527, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.20205402374267578, "step": 4567 }, { "epoch": 0.06821015536923525, "grad_norm": 0.515625, "grad_norm_var": 0.0023152033487955728, "learning_rate": 0.0001, "loss": 1.5666, "loss/crossentropy": 2.181017279624939, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.17595041543245316, "step": 4568 }, { "epoch": 0.06822508753985022, "grad_norm": 0.380859375, "grad_norm_var": 0.002325884501139323, "learning_rate": 0.0001, "loss": 1.4337, "loss/crossentropy": 2.5971362590789795, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.1719416305422783, "step": 4569 }, { "epoch": 0.06824001971046521, "grad_norm": 0.322265625, "grad_norm_var": 0.0023859659830729168, "learning_rate": 0.0001, "loss": 1.5804, "loss/crossentropy": 2.734256863594055, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.22490499913692474, "step": 4570 }, { "epoch": 0.06825495188108019, "grad_norm": 0.38671875, "grad_norm_var": 0.0024059136708577474, "learning_rate": 0.0001, "loss": 1.5207, "loss/crossentropy": 2.330000400543213, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.1847190484404564, "step": 4571 }, { "epoch": 0.06826988405169518, "grad_norm": 0.50390625, "grad_norm_var": 0.0035153547922770183, "learning_rate": 0.0001, "loss": 1.8427, "loss/crossentropy": 2.5393627882003784, "loss/fcd": 1.609375, "loss/idx": 9.0, "loss/logits": 0.23337142914533615, "step": 4572 }, { "epoch": 0.06828481622231015, "grad_norm": 0.44921875, "grad_norm_var": 0.0037806034088134766, "learning_rate": 0.0001, "loss": 1.672, "loss/crossentropy": 2.4964499473571777, "loss/fcd": 1.45703125, "loss/idx": 9.0, "loss/logits": 0.21499865502119064, "step": 4573 }, { "epoch": 0.06829974839292514, "grad_norm": 0.35546875, "grad_norm_var": 0.003816843032836914, "learning_rate": 0.0001, "loss": 1.4841, "loss/crossentropy": 2.6180487871170044, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.1911042332649231, "step": 4574 }, { "epoch": 0.06831468056354012, "grad_norm": 0.40625, "grad_norm_var": 0.0037139256795247397, "learning_rate": 0.0001, "loss": 1.6302, "loss/crossentropy": 2.4736047983169556, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.20055948197841644, "step": 4575 }, { "epoch": 0.0683296127341551, "grad_norm": 0.365234375, "grad_norm_var": 0.003718296686808268, "learning_rate": 0.0001, "loss": 1.4402, "loss/crossentropy": 2.67493212223053, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.18241055309772491, "step": 4576 }, { "epoch": 0.06834454490477009, "grad_norm": 0.400390625, "grad_norm_var": 0.00373687744140625, "learning_rate": 0.0001, "loss": 1.4724, "loss/crossentropy": 2.8258554935455322, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.17550291121006012, "step": 4577 }, { "epoch": 0.06835947707538506, "grad_norm": 0.36328125, "grad_norm_var": 0.0037561893463134766, "learning_rate": 0.0001, "loss": 1.4648, "loss/crossentropy": 2.688049077987671, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.1874322071671486, "step": 4578 }, { "epoch": 0.06837440924600005, "grad_norm": 0.462890625, "grad_norm_var": 0.003998295466105143, "learning_rate": 0.0001, "loss": 1.7398, "loss/crossentropy": 2.569748282432556, "loss/fcd": 1.484375, "loss/idx": 9.0, "loss/logits": 0.25547094643116, "step": 4579 }, { "epoch": 0.06838934141661503, "grad_norm": 0.45703125, "grad_norm_var": 0.004156223932902018, "learning_rate": 0.0001, "loss": 1.6945, "loss/crossentropy": 2.6557313203811646, "loss/fcd": 1.4609375, "loss/idx": 9.0, "loss/logits": 0.23353803157806396, "step": 4580 }, { "epoch": 0.06840427358723, "grad_norm": 0.404296875, "grad_norm_var": 0.0037020365397135418, "learning_rate": 0.0001, "loss": 1.6281, "loss/crossentropy": 2.5216983556747437, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.19841821491718292, "step": 4581 }, { "epoch": 0.06841920575784499, "grad_norm": 0.349609375, "grad_norm_var": 0.0036272525787353514, "learning_rate": 0.0001, "loss": 1.5772, "loss/crossentropy": 2.4618078470230103, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.2099900022149086, "step": 4582 }, { "epoch": 0.06843413792845997, "grad_norm": 0.390625, "grad_norm_var": 0.0031198978424072264, "learning_rate": 0.0001, "loss": 1.4706, "loss/crossentropy": 2.616675615310669, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.16986532509326935, "step": 4583 }, { "epoch": 0.06844907009907496, "grad_norm": 0.408203125, "grad_norm_var": 0.0022867838541666668, "learning_rate": 0.0001, "loss": 1.5534, "loss/crossentropy": 2.550185203552246, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.2018590047955513, "step": 4584 }, { "epoch": 0.06846400226968993, "grad_norm": 0.365234375, "grad_norm_var": 0.0023427327473958333, "learning_rate": 0.0001, "loss": 1.509, "loss/crossentropy": 2.5918368101119995, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.18480630218982697, "step": 4585 }, { "epoch": 0.06847893444030491, "grad_norm": 0.330078125, "grad_norm_var": 0.002266184488932292, "learning_rate": 0.0001, "loss": 1.4248, "loss/crossentropy": 2.436041474342346, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.15921074897050858, "step": 4586 }, { "epoch": 0.0684938666109199, "grad_norm": 0.328125, "grad_norm_var": 0.002583758036295573, "learning_rate": 0.0001, "loss": 1.4258, "loss/crossentropy": 2.4943528175354004, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.1836528182029724, "step": 4587 }, { "epoch": 0.06850879878153487, "grad_norm": 0.384765625, "grad_norm_var": 0.0017605940500895181, "learning_rate": 0.0001, "loss": 1.5146, "loss/crossentropy": 2.555799961090088, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18256353586912155, "step": 4588 }, { "epoch": 0.06852373095214986, "grad_norm": 0.36328125, "grad_norm_var": 0.0015298048655192056, "learning_rate": 0.0001, "loss": 1.5002, "loss/crossentropy": 2.5029983520507812, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.18379847705364227, "step": 4589 }, { "epoch": 0.06853866312276484, "grad_norm": 0.419921875, "grad_norm_var": 0.001549212137858073, "learning_rate": 0.0001, "loss": 1.5151, "loss/crossentropy": 2.6374967098236084, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.18302995711565018, "step": 4590 }, { "epoch": 0.06855359529337982, "grad_norm": 0.388671875, "grad_norm_var": 0.0015244642893473307, "learning_rate": 0.0001, "loss": 1.6778, "loss/crossentropy": 2.682401657104492, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.24032945930957794, "step": 4591 }, { "epoch": 0.0685685274639948, "grad_norm": 0.3984375, "grad_norm_var": 0.0014998753865559896, "learning_rate": 0.0001, "loss": 1.5122, "loss/crossentropy": 2.4149489402770996, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.1996963694691658, "step": 4592 }, { "epoch": 0.06858345963460978, "grad_norm": 0.4140625, "grad_norm_var": 0.001533365249633789, "learning_rate": 0.0001, "loss": 1.6627, "loss/crossentropy": 2.504308581352234, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.21348819881677628, "step": 4593 }, { "epoch": 0.06859839180522477, "grad_norm": 0.328125, "grad_norm_var": 0.0017324924468994141, "learning_rate": 0.0001, "loss": 1.5371, "loss/crossentropy": 2.476078510284424, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.20899958163499832, "step": 4594 }, { "epoch": 0.06861332397583975, "grad_norm": 0.404296875, "grad_norm_var": 0.0013548374176025391, "learning_rate": 0.0001, "loss": 1.548, "loss/crossentropy": 2.8343772888183594, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.1925113946199417, "step": 4595 }, { "epoch": 0.06862825614645474, "grad_norm": 0.39453125, "grad_norm_var": 0.000985574722290039, "learning_rate": 0.0001, "loss": 1.4125, "loss/crossentropy": 2.4302698373794556, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.16253376007080078, "step": 4596 }, { "epoch": 0.06864318831706971, "grad_norm": 0.48046875, "grad_norm_var": 0.001599884033203125, "learning_rate": 0.0001, "loss": 1.5798, "loss/crossentropy": 2.739599108695984, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.2086574137210846, "step": 4597 }, { "epoch": 0.06865812048768469, "grad_norm": 0.380859375, "grad_norm_var": 0.0015164693196614584, "learning_rate": 0.0001, "loss": 1.5695, "loss/crossentropy": 2.4845714569091797, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.19058270752429962, "step": 4598 }, { "epoch": 0.06867305265829968, "grad_norm": 0.35546875, "grad_norm_var": 0.0015731175740559896, "learning_rate": 0.0001, "loss": 1.4402, "loss/crossentropy": 2.641726851463318, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.1745588332414627, "step": 4599 }, { "epoch": 0.06868798482891465, "grad_norm": 0.42578125, "grad_norm_var": 0.0016490777333577475, "learning_rate": 0.0001, "loss": 1.5008, "loss/crossentropy": 2.609360694885254, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.1804669424891472, "step": 4600 }, { "epoch": 0.06870291699952964, "grad_norm": 0.3671875, "grad_norm_var": 0.001644134521484375, "learning_rate": 0.0001, "loss": 1.5465, "loss/crossentropy": 2.5753320455551147, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.1988794207572937, "step": 4601 }, { "epoch": 0.06871784917014462, "grad_norm": 0.365234375, "grad_norm_var": 0.0014627456665039062, "learning_rate": 0.0001, "loss": 1.5847, "loss/crossentropy": 2.4842751026153564, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.19794809818267822, "step": 4602 }, { "epoch": 0.0687327813407596, "grad_norm": 0.41796875, "grad_norm_var": 0.001256561279296875, "learning_rate": 0.0001, "loss": 1.5835, "loss/crossentropy": 2.673075318336487, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.21630704402923584, "step": 4603 }, { "epoch": 0.06874771351137458, "grad_norm": 0.478515625, "grad_norm_var": 0.001702117919921875, "learning_rate": 0.0001, "loss": 1.7155, "loss/crossentropy": 2.5170347690582275, "loss/fcd": 1.46875, "loss/idx": 9.0, "loss/logits": 0.24674518406391144, "step": 4604 }, { "epoch": 0.06876264568198956, "grad_norm": 0.34765625, "grad_norm_var": 0.0017916361490885417, "learning_rate": 0.0001, "loss": 1.4449, "loss/crossentropy": 2.6309698820114136, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.17540766298770905, "step": 4605 }, { "epoch": 0.06877757785260455, "grad_norm": 0.3828125, "grad_norm_var": 0.0017689863840738933, "learning_rate": 0.0001, "loss": 1.4998, "loss/crossentropy": 2.5592331886291504, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.1912199854850769, "step": 4606 }, { "epoch": 0.06879251002321952, "grad_norm": 0.36328125, "grad_norm_var": 0.0018328348795572916, "learning_rate": 0.0001, "loss": 1.4804, "loss/crossentropy": 2.6395992040634155, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.179592564702034, "step": 4607 }, { "epoch": 0.0688074421938345, "grad_norm": 0.376953125, "grad_norm_var": 0.0018490950266520183, "learning_rate": 0.0001, "loss": 1.4027, "loss/crossentropy": 2.70712149143219, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.16052880883216858, "step": 4608 }, { "epoch": 0.06882237436444949, "grad_norm": 0.3828125, "grad_norm_var": 0.0018211205800374348, "learning_rate": 0.0001, "loss": 1.5524, "loss/crossentropy": 2.5407330989837646, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.20081330835819244, "step": 4609 }, { "epoch": 0.06883730653506447, "grad_norm": 0.419921875, "grad_norm_var": 0.0015813191731770834, "learning_rate": 0.0001, "loss": 1.4478, "loss/crossentropy": 2.732727289199829, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.17823857814073563, "step": 4610 }, { "epoch": 0.06885223870567946, "grad_norm": 0.36328125, "grad_norm_var": 0.0016437371571858723, "learning_rate": 0.0001, "loss": 1.5528, "loss/crossentropy": 2.5579839944839478, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.2090889886021614, "step": 4611 }, { "epoch": 0.06886717087629443, "grad_norm": 0.42578125, "grad_norm_var": 0.001707315444946289, "learning_rate": 0.0001, "loss": 1.6901, "loss/crossentropy": 2.5170576572418213, "loss/fcd": 1.47265625, "loss/idx": 9.0, "loss/logits": 0.21744544059038162, "step": 4612 }, { "epoch": 0.06888210304690942, "grad_norm": 0.365234375, "grad_norm_var": 0.00123748779296875, "learning_rate": 0.0001, "loss": 1.5892, "loss/crossentropy": 2.4401012659072876, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.20643237233161926, "step": 4613 }, { "epoch": 0.0688970352175244, "grad_norm": 0.365234375, "grad_norm_var": 0.0012690226236979167, "learning_rate": 0.0001, "loss": 1.7443, "loss/crossentropy": 2.4860368967056274, "loss/fcd": 1.4921875, "loss/idx": 9.0, "loss/logits": 0.2521332651376724, "step": 4614 }, { "epoch": 0.06891196738813937, "grad_norm": 0.416015625, "grad_norm_var": 0.0012379805246988931, "learning_rate": 0.0001, "loss": 1.6448, "loss/crossentropy": 2.5056443214416504, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.21120373904705048, "step": 4615 }, { "epoch": 0.06892689955875436, "grad_norm": 0.40234375, "grad_norm_var": 0.0011651198069254558, "learning_rate": 0.0001, "loss": 1.7286, "loss/crossentropy": 2.7472606897354126, "loss/fcd": 1.48046875, "loss/idx": 9.0, "loss/logits": 0.24817392975091934, "step": 4616 }, { "epoch": 0.06894183172936934, "grad_norm": 0.34375, "grad_norm_var": 0.0012707869211832682, "learning_rate": 0.0001, "loss": 1.4535, "loss/crossentropy": 2.4711239337921143, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.16836710274219513, "step": 4617 }, { "epoch": 0.06895676389998433, "grad_norm": 0.388671875, "grad_norm_var": 0.0012322584788004556, "learning_rate": 0.0001, "loss": 1.6103, "loss/crossentropy": 2.642130970954895, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.2235317975282669, "step": 4618 }, { "epoch": 0.0689716960705993, "grad_norm": 0.330078125, "grad_norm_var": 0.0013874689737955728, "learning_rate": 0.0001, "loss": 1.4344, "loss/crossentropy": 2.599454164505005, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.16486650705337524, "step": 4619 }, { "epoch": 0.06898662824121428, "grad_norm": 0.375, "grad_norm_var": 0.0007598718007405599, "learning_rate": 0.0001, "loss": 1.5618, "loss/crossentropy": 2.811405062675476, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.19465775787830353, "step": 4620 }, { "epoch": 0.06900156041182927, "grad_norm": 0.322265625, "grad_norm_var": 0.0009030659993489583, "learning_rate": 0.0001, "loss": 1.369, "loss/crossentropy": 2.690198302268982, "loss/fcd": 1.2109375, "loss/idx": 9.0, "loss/logits": 0.15808268636465073, "step": 4621 }, { "epoch": 0.06901649258244424, "grad_norm": 0.357421875, "grad_norm_var": 0.0009218692779541015, "learning_rate": 0.0001, "loss": 1.4427, "loss/crossentropy": 2.6002209186553955, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17711368948221207, "step": 4622 }, { "epoch": 0.06903142475305923, "grad_norm": 0.37890625, "grad_norm_var": 0.0009129683176676433, "learning_rate": 0.0001, "loss": 1.6558, "loss/crossentropy": 2.8199092149734497, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.2222471758723259, "step": 4623 }, { "epoch": 0.06904635692367421, "grad_norm": 0.37890625, "grad_norm_var": 0.0009134928385416667, "learning_rate": 0.0001, "loss": 1.5172, "loss/crossentropy": 2.7897231578826904, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19296090304851532, "step": 4624 }, { "epoch": 0.06906128909428919, "grad_norm": 0.36328125, "grad_norm_var": 0.0009195327758789063, "learning_rate": 0.0001, "loss": 1.5968, "loss/crossentropy": 2.428196668624878, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.2139851301908493, "step": 4625 }, { "epoch": 0.06907622126490418, "grad_norm": 0.322265625, "grad_norm_var": 0.0009274800618489583, "learning_rate": 0.0001, "loss": 1.4349, "loss/crossentropy": 2.441898465156555, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.16930925846099854, "step": 4626 }, { "epoch": 0.06909115343551915, "grad_norm": 0.4140625, "grad_norm_var": 0.0010522842407226563, "learning_rate": 0.0001, "loss": 1.5756, "loss/crossentropy": 2.4072506427764893, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.1888575777411461, "step": 4627 }, { "epoch": 0.06910608560613414, "grad_norm": 0.380859375, "grad_norm_var": 0.0008552392323811849, "learning_rate": 0.0001, "loss": 1.494, "loss/crossentropy": 2.5075007677078247, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.18153245747089386, "step": 4628 }, { "epoch": 0.06912101777674912, "grad_norm": 0.349609375, "grad_norm_var": 0.0008783817291259765, "learning_rate": 0.0001, "loss": 1.462, "loss/crossentropy": 2.5191659927368164, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18464366346597672, "step": 4629 }, { "epoch": 0.06913594994736409, "grad_norm": 0.478515625, "grad_norm_var": 0.0016380151112874348, "learning_rate": 0.0001, "loss": 1.7037, "loss/crossentropy": 2.4684442281723022, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.2544800639152527, "step": 4630 }, { "epoch": 0.06915088211797908, "grad_norm": 0.365234375, "grad_norm_var": 0.0015223026275634766, "learning_rate": 0.0001, "loss": 1.6206, "loss/crossentropy": 2.574568510055542, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.22604576498270035, "step": 4631 }, { "epoch": 0.06916581428859406, "grad_norm": 0.388671875, "grad_norm_var": 0.00147857666015625, "learning_rate": 0.0001, "loss": 1.5576, "loss/crossentropy": 2.6515496969223022, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.2216154932975769, "step": 4632 }, { "epoch": 0.06918074645920905, "grad_norm": 0.333984375, "grad_norm_var": 0.0015201409657796224, "learning_rate": 0.0001, "loss": 1.3201, "loss/crossentropy": 2.673411011695862, "loss/fcd": 1.16796875, "loss/idx": 9.0, "loss/logits": 0.15208247303962708, "step": 4633 }, { "epoch": 0.06919567862982402, "grad_norm": 0.40234375, "grad_norm_var": 0.0015649795532226562, "learning_rate": 0.0001, "loss": 1.6386, "loss/crossentropy": 2.6933573484420776, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.2205968201160431, "step": 4634 }, { "epoch": 0.06921061080043901, "grad_norm": 0.341796875, "grad_norm_var": 0.00150909423828125, "learning_rate": 0.0001, "loss": 1.4961, "loss/crossentropy": 2.623626232147217, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.1874958798289299, "step": 4635 }, { "epoch": 0.06922554297105399, "grad_norm": 0.35546875, "grad_norm_var": 0.0015253067016601563, "learning_rate": 0.0001, "loss": 1.467, "loss/crossentropy": 2.5508402585983276, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18966709077358246, "step": 4636 }, { "epoch": 0.06924047514166896, "grad_norm": 0.3671875, "grad_norm_var": 0.0013604323069254557, "learning_rate": 0.0001, "loss": 1.4406, "loss/crossentropy": 2.630552887916565, "loss/fcd": 1.265625, "loss/idx": 9.0, "loss/logits": 0.17499098926782608, "step": 4637 }, { "epoch": 0.06925540731228395, "grad_norm": 0.31640625, "grad_norm_var": 0.0015543619791666666, "learning_rate": 0.0001, "loss": 1.5392, "loss/crossentropy": 2.418249487876892, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.19940617680549622, "step": 4638 }, { "epoch": 0.06927033948289893, "grad_norm": 0.375, "grad_norm_var": 0.0015512466430664062, "learning_rate": 0.0001, "loss": 1.4784, "loss/crossentropy": 2.6730518341064453, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18544083088636398, "step": 4639 }, { "epoch": 0.06928527165351392, "grad_norm": 0.51171875, "grad_norm_var": 0.002796363830566406, "learning_rate": 0.0001, "loss": 2.1066, "loss/crossentropy": 2.510825991630554, "loss/fcd": 1.8359375, "loss/idx": 9.0, "loss/logits": 0.2707029730081558, "step": 4640 }, { "epoch": 0.0693002038241289, "grad_norm": 0.32421875, "grad_norm_var": 0.002974383036295573, "learning_rate": 0.0001, "loss": 1.3976, "loss/crossentropy": 2.590042233467102, "loss/fcd": 1.2265625, "loss/idx": 9.0, "loss/logits": 0.1710788607597351, "step": 4641 }, { "epoch": 0.06931513599474387, "grad_norm": 0.46875, "grad_norm_var": 0.0032521406809488933, "learning_rate": 0.0001, "loss": 1.5328, "loss/crossentropy": 2.196318566799164, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.1734093278646469, "step": 4642 }, { "epoch": 0.06933006816535886, "grad_norm": 0.4296875, "grad_norm_var": 0.0033261458079020183, "learning_rate": 0.0001, "loss": 1.6979, "loss/crossentropy": 2.426760196685791, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.20961295813322067, "step": 4643 }, { "epoch": 0.06934500033597384, "grad_norm": 0.4453125, "grad_norm_var": 0.003534380594889323, "learning_rate": 0.0001, "loss": 1.7023, "loss/crossentropy": 2.2811635732650757, "loss/fcd": 1.50390625, "loss/idx": 9.0, "loss/logits": 0.1983536034822464, "step": 4644 }, { "epoch": 0.06935993250658883, "grad_norm": 0.50390625, "grad_norm_var": 0.0041735172271728516, "learning_rate": 0.0001, "loss": 1.7767, "loss/crossentropy": 2.3442111015319824, "loss/fcd": 1.5703125, "loss/idx": 9.0, "loss/logits": 0.20637919753789902, "step": 4645 }, { "epoch": 0.0693748646772038, "grad_norm": 0.34375, "grad_norm_var": 0.003907012939453125, "learning_rate": 0.0001, "loss": 1.4234, "loss/crossentropy": 2.5363729000091553, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.16949527710676193, "step": 4646 }, { "epoch": 0.06938979684781878, "grad_norm": 0.349609375, "grad_norm_var": 0.003978220621744791, "learning_rate": 0.0001, "loss": 1.4621, "loss/crossentropy": 2.540289044380188, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.17694628238677979, "step": 4647 }, { "epoch": 0.06940472901843377, "grad_norm": 0.421875, "grad_norm_var": 0.004036315282185872, "learning_rate": 0.0001, "loss": 1.53, "loss/crossentropy": 2.510650396347046, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.20573686063289642, "step": 4648 }, { "epoch": 0.06941966118904874, "grad_norm": 0.4765625, "grad_norm_var": 0.0041813532511393225, "learning_rate": 0.0001, "loss": 1.5566, "loss/crossentropy": 2.4546282291412354, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.19333051145076752, "step": 4649 }, { "epoch": 0.06943459335966373, "grad_norm": 0.396484375, "grad_norm_var": 0.0041833082834879555, "learning_rate": 0.0001, "loss": 1.4178, "loss/crossentropy": 2.7026443481445312, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.163864865899086, "step": 4650 }, { "epoch": 0.06944952553027871, "grad_norm": 0.44140625, "grad_norm_var": 0.004007403055826823, "learning_rate": 0.0001, "loss": 1.4879, "loss/crossentropy": 2.7281938791275024, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.18321123719215393, "step": 4651 }, { "epoch": 0.06946445770089368, "grad_norm": 0.359375, "grad_norm_var": 0.00398101806640625, "learning_rate": 0.0001, "loss": 1.533, "loss/crossentropy": 2.585829496383667, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.17750875651836395, "step": 4652 }, { "epoch": 0.06947938987150867, "grad_norm": 0.38671875, "grad_norm_var": 0.0038980484008789063, "learning_rate": 0.0001, "loss": 1.4511, "loss/crossentropy": 2.9839770793914795, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17762264609336853, "step": 4653 }, { "epoch": 0.06949432204212365, "grad_norm": 0.4296875, "grad_norm_var": 0.003295135498046875, "learning_rate": 0.0001, "loss": 1.6329, "loss/crossentropy": 2.8026371002197266, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.20320799946784973, "step": 4654 }, { "epoch": 0.06950925421273864, "grad_norm": 0.37109375, "grad_norm_var": 0.003317705790201823, "learning_rate": 0.0001, "loss": 1.6587, "loss/crossentropy": 2.5433706045150757, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.21725529432296753, "step": 4655 }, { "epoch": 0.06952418638335361, "grad_norm": 0.357421875, "grad_norm_var": 0.002841806411743164, "learning_rate": 0.0001, "loss": 1.5121, "loss/crossentropy": 2.614259958267212, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.19175734370946884, "step": 4656 }, { "epoch": 0.0695391185539686, "grad_norm": 0.322265625, "grad_norm_var": 0.002863502502441406, "learning_rate": 0.0001, "loss": 1.3718, "loss/crossentropy": 2.672187566757202, "loss/fcd": 1.203125, "loss/idx": 9.0, "loss/logits": 0.16869863867759705, "step": 4657 }, { "epoch": 0.06955405072458358, "grad_norm": 0.40625, "grad_norm_var": 0.002588844299316406, "learning_rate": 0.0001, "loss": 1.5982, "loss/crossentropy": 2.663278818130493, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.19973981380462646, "step": 4658 }, { "epoch": 0.06956898289519856, "grad_norm": 0.375, "grad_norm_var": 0.0025781631469726563, "learning_rate": 0.0001, "loss": 1.5177, "loss/crossentropy": 2.47927463054657, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.1934349164366722, "step": 4659 }, { "epoch": 0.06958391506581355, "grad_norm": 0.390625, "grad_norm_var": 0.002428627014160156, "learning_rate": 0.0001, "loss": 1.6288, "loss/crossentropy": 2.6201950311660767, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.22643938660621643, "step": 4660 }, { "epoch": 0.06959884723642852, "grad_norm": 0.359375, "grad_norm_var": 0.0016499837239583334, "learning_rate": 0.0001, "loss": 1.4804, "loss/crossentropy": 2.7761462926864624, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18739421665668488, "step": 4661 }, { "epoch": 0.06961377940704351, "grad_norm": 0.34765625, "grad_norm_var": 0.001628557840983073, "learning_rate": 0.0001, "loss": 1.5275, "loss/crossentropy": 2.7403411865234375, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.1993524581193924, "step": 4662 }, { "epoch": 0.06962871157765849, "grad_norm": 0.40625, "grad_norm_var": 0.0015469710032145181, "learning_rate": 0.0001, "loss": 1.64, "loss/crossentropy": 2.3722277879714966, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.19862627983093262, "step": 4663 }, { "epoch": 0.06964364374827346, "grad_norm": 0.404296875, "grad_norm_var": 0.001492754618326823, "learning_rate": 0.0001, "loss": 1.5239, "loss/crossentropy": 2.7442073822021484, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.184088796377182, "step": 4664 }, { "epoch": 0.06965857591888845, "grad_norm": 0.380859375, "grad_norm_var": 0.0009530226389567058, "learning_rate": 0.0001, "loss": 1.4436, "loss/crossentropy": 2.7476714849472046, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.18183152377605438, "step": 4665 }, { "epoch": 0.06967350808950343, "grad_norm": 0.5078125, "grad_norm_var": 0.0019215265909830728, "learning_rate": 0.0001, "loss": 1.5818, "loss/crossentropy": 2.5408568382263184, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.19503448903560638, "step": 4666 }, { "epoch": 0.06968844026011842, "grad_norm": 0.3671875, "grad_norm_var": 0.0017608642578125, "learning_rate": 0.0001, "loss": 1.6716, "loss/crossentropy": 2.722986102104187, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.2380080297589302, "step": 4667 }, { "epoch": 0.06970337243073339, "grad_norm": 0.3671875, "grad_norm_var": 0.001737213134765625, "learning_rate": 0.0001, "loss": 1.4728, "loss/crossentropy": 2.5299289226531982, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.17594074457883835, "step": 4668 }, { "epoch": 0.06971830460134837, "grad_norm": 0.35546875, "grad_norm_var": 0.0017962137858072917, "learning_rate": 0.0001, "loss": 1.5478, "loss/crossentropy": 2.5981128215789795, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.2040254846215248, "step": 4669 }, { "epoch": 0.06973323677196336, "grad_norm": 0.326171875, "grad_norm_var": 0.0018391768137613932, "learning_rate": 0.0001, "loss": 1.4093, "loss/crossentropy": 2.531408429145813, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.1749648079276085, "step": 4670 }, { "epoch": 0.06974816894257833, "grad_norm": 0.318359375, "grad_norm_var": 0.002060190836588542, "learning_rate": 0.0001, "loss": 1.412, "loss/crossentropy": 2.5272799730300903, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.16985540091991425, "step": 4671 }, { "epoch": 0.06976310111319332, "grad_norm": 0.37890625, "grad_norm_var": 0.00204008420308431, "learning_rate": 0.0001, "loss": 1.5371, "loss/crossentropy": 2.483139753341675, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.19338900595903397, "step": 4672 }, { "epoch": 0.0697780332838083, "grad_norm": 0.345703125, "grad_norm_var": 0.0019069512685139973, "learning_rate": 0.0001, "loss": 1.5575, "loss/crossentropy": 2.5877264738082886, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.1863800212740898, "step": 4673 }, { "epoch": 0.06979296545442329, "grad_norm": 0.345703125, "grad_norm_var": 0.0019025166829427084, "learning_rate": 0.0001, "loss": 1.4405, "loss/crossentropy": 2.497534990310669, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.178814098238945, "step": 4674 }, { "epoch": 0.06980789762503826, "grad_norm": 0.39453125, "grad_norm_var": 0.0019301732381184896, "learning_rate": 0.0001, "loss": 1.5072, "loss/crossentropy": 2.6341124773025513, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.1907498985528946, "step": 4675 }, { "epoch": 0.06982282979565324, "grad_norm": 0.322265625, "grad_norm_var": 0.002077595392862956, "learning_rate": 0.0001, "loss": 1.3662, "loss/crossentropy": 2.439703106880188, "loss/fcd": 1.21484375, "loss/idx": 9.0, "loss/logits": 0.15139994770288467, "step": 4676 }, { "epoch": 0.06983776196626823, "grad_norm": 0.44921875, "grad_norm_var": 0.0024490197499593097, "learning_rate": 0.0001, "loss": 1.5292, "loss/crossentropy": 2.8108627796173096, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19715536385774612, "step": 4677 }, { "epoch": 0.0698526941368832, "grad_norm": 0.34765625, "grad_norm_var": 0.0024490197499593097, "learning_rate": 0.0001, "loss": 1.4252, "loss/crossentropy": 2.5290677547454834, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.16350556910037994, "step": 4678 }, { "epoch": 0.0698676263074982, "grad_norm": 0.41015625, "grad_norm_var": 0.002465677261352539, "learning_rate": 0.0001, "loss": 1.8537, "loss/crossentropy": 2.8027788400650024, "loss/fcd": 1.546875, "loss/idx": 9.0, "loss/logits": 0.3067924231290817, "step": 4679 }, { "epoch": 0.06988255847811317, "grad_norm": 0.376953125, "grad_norm_var": 0.0024104913075764975, "learning_rate": 0.0001, "loss": 1.4525, "loss/crossentropy": 2.655419945716858, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17910179495811462, "step": 4680 }, { "epoch": 0.06989749064872815, "grad_norm": 0.419921875, "grad_norm_var": 0.002538283665974935, "learning_rate": 0.0001, "loss": 1.7076, "loss/crossentropy": 2.655218720436096, "loss/fcd": 1.4765625, "loss/idx": 9.0, "loss/logits": 0.23108042776584625, "step": 4681 }, { "epoch": 0.06991242281934314, "grad_norm": 0.37890625, "grad_norm_var": 0.001329787572224935, "learning_rate": 0.0001, "loss": 1.6027, "loss/crossentropy": 2.49862802028656, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.20820432901382446, "step": 4682 }, { "epoch": 0.06992735498995811, "grad_norm": 0.376953125, "grad_norm_var": 0.0013333638509114584, "learning_rate": 0.0001, "loss": 1.5443, "loss/crossentropy": 2.5892597436904907, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.20055770128965378, "step": 4683 }, { "epoch": 0.0699422871605731, "grad_norm": 0.3671875, "grad_norm_var": 0.0013333638509114584, "learning_rate": 0.0001, "loss": 1.4547, "loss/crossentropy": 2.7190572023391724, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.17734478414058685, "step": 4684 }, { "epoch": 0.06995721933118808, "grad_norm": 0.439453125, "grad_norm_var": 0.0016156355539957681, "learning_rate": 0.0001, "loss": 1.4641, "loss/crossentropy": 2.7466397285461426, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.17895758152008057, "step": 4685 }, { "epoch": 0.06997215150180305, "grad_norm": 0.34765625, "grad_norm_var": 0.0015049616495768229, "learning_rate": 0.0001, "loss": 1.4823, "loss/crossentropy": 2.5602136850357056, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18937475979328156, "step": 4686 }, { "epoch": 0.06998708367241804, "grad_norm": 0.41015625, "grad_norm_var": 0.0013234297434488933, "learning_rate": 0.0001, "loss": 1.7191, "loss/crossentropy": 2.342607259750366, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.23085525631904602, "step": 4687 }, { "epoch": 0.07000201584303302, "grad_norm": 0.373046875, "grad_norm_var": 0.001327959696451823, "learning_rate": 0.0001, "loss": 1.5921, "loss/crossentropy": 2.677141785621643, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.22103843837976456, "step": 4688 }, { "epoch": 0.07001694801364801, "grad_norm": 0.4140625, "grad_norm_var": 0.0012929121653238933, "learning_rate": 0.0001, "loss": 1.7171, "loss/crossentropy": 2.137522339820862, "loss/fcd": 1.4921875, "loss/idx": 9.0, "loss/logits": 0.22489557415246964, "step": 4689 }, { "epoch": 0.07003188018426298, "grad_norm": 0.396484375, "grad_norm_var": 0.0011821587880452473, "learning_rate": 0.0001, "loss": 1.4752, "loss/crossentropy": 2.536611557006836, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18225625157356262, "step": 4690 }, { "epoch": 0.07004681235487796, "grad_norm": 0.416015625, "grad_norm_var": 0.0012267430623372395, "learning_rate": 0.0001, "loss": 1.7468, "loss/crossentropy": 2.4955257177352905, "loss/fcd": 1.515625, "loss/idx": 9.0, "loss/logits": 0.23119476437568665, "step": 4691 }, { "epoch": 0.07006174452549295, "grad_norm": 0.369140625, "grad_norm_var": 0.0009383519490559895, "learning_rate": 0.0001, "loss": 1.553, "loss/crossentropy": 2.520110607147217, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19361094385385513, "step": 4692 }, { "epoch": 0.07007667669610793, "grad_norm": 0.384765625, "grad_norm_var": 0.0007175286610921224, "learning_rate": 0.0001, "loss": 1.5801, "loss/crossentropy": 2.4781934022903442, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.20506486296653748, "step": 4693 }, { "epoch": 0.07009160886672292, "grad_norm": 0.35546875, "grad_norm_var": 0.0006779829661051432, "learning_rate": 0.0001, "loss": 1.5621, "loss/crossentropy": 2.711501717567444, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.20666850358247757, "step": 4694 }, { "epoch": 0.07010654103733789, "grad_norm": 0.375, "grad_norm_var": 0.0006596724192301433, "learning_rate": 0.0001, "loss": 1.5777, "loss/crossentropy": 2.6589282751083374, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.20265083760023117, "step": 4695 }, { "epoch": 0.07012147320795288, "grad_norm": 0.376953125, "grad_norm_var": 0.0006596724192301433, "learning_rate": 0.0001, "loss": 1.6497, "loss/crossentropy": 2.2846230268478394, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.23177475482225418, "step": 4696 }, { "epoch": 0.07013640537856786, "grad_norm": 0.431640625, "grad_norm_var": 0.0007188002268473308, "learning_rate": 0.0001, "loss": 1.6861, "loss/crossentropy": 2.47269868850708, "loss/fcd": 1.46484375, "loss/idx": 9.0, "loss/logits": 0.22124120593070984, "step": 4697 }, { "epoch": 0.07015133754918283, "grad_norm": 0.3515625, "grad_norm_var": 0.0007997989654541015, "learning_rate": 0.0001, "loss": 1.4734, "loss/crossentropy": 2.312289595603943, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.15700392425060272, "step": 4698 }, { "epoch": 0.07016626971979782, "grad_norm": 0.44140625, "grad_norm_var": 0.0009765625, "learning_rate": 0.0001, "loss": 1.496, "loss/crossentropy": 2.72152316570282, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.1913422867655754, "step": 4699 }, { "epoch": 0.0701812018904128, "grad_norm": 0.36328125, "grad_norm_var": 0.0009897232055664062, "learning_rate": 0.0001, "loss": 1.5031, "loss/crossentropy": 2.5160927772521973, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19062026590108871, "step": 4700 }, { "epoch": 0.07019613406102779, "grad_norm": 0.6171875, "grad_norm_var": 0.004126977920532226, "learning_rate": 0.0001, "loss": 1.5575, "loss/crossentropy": 2.629739999771118, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.19424261897802353, "step": 4701 }, { "epoch": 0.07021106623164276, "grad_norm": 0.33984375, "grad_norm_var": 0.004186868667602539, "learning_rate": 0.0001, "loss": 1.3796, "loss/crossentropy": 2.533465623855591, "loss/fcd": 1.22265625, "loss/idx": 9.0, "loss/logits": 0.1569589227437973, "step": 4702 }, { "epoch": 0.07022599840225774, "grad_norm": 0.50390625, "grad_norm_var": 0.004850625991821289, "learning_rate": 0.0001, "loss": 1.6169, "loss/crossentropy": 2.573672652244568, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.2145870178937912, "step": 4703 }, { "epoch": 0.07024093057287273, "grad_norm": 0.3828125, "grad_norm_var": 0.004812558492024739, "learning_rate": 0.0001, "loss": 1.4591, "loss/crossentropy": 2.76527738571167, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.18561843782663345, "step": 4704 }, { "epoch": 0.0702558627434877, "grad_norm": 0.4296875, "grad_norm_var": 0.00484155019124349, "learning_rate": 0.0001, "loss": 1.6586, "loss/crossentropy": 2.476593017578125, "loss/fcd": 1.45703125, "loss/idx": 9.0, "loss/logits": 0.20156704634428024, "step": 4705 }, { "epoch": 0.0702707949141027, "grad_norm": 0.4140625, "grad_norm_var": 0.004832824071248372, "learning_rate": 0.0001, "loss": 1.653, "loss/crossentropy": 2.7569010257720947, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.22721782326698303, "step": 4706 }, { "epoch": 0.07028572708471767, "grad_norm": 0.3359375, "grad_norm_var": 0.005164527893066406, "learning_rate": 0.0001, "loss": 1.4688, "loss/crossentropy": 2.6862664222717285, "loss/fcd": 1.28125, "loss/idx": 9.0, "loss/logits": 0.18752942979335785, "step": 4707 }, { "epoch": 0.07030065925533265, "grad_norm": 0.353515625, "grad_norm_var": 0.00525353749593099, "learning_rate": 0.0001, "loss": 1.4896, "loss/crossentropy": 2.457239508628845, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.181009940803051, "step": 4708 }, { "epoch": 0.07031559142594764, "grad_norm": 0.37890625, "grad_norm_var": 0.00527036984761556, "learning_rate": 0.0001, "loss": 1.5725, "loss/crossentropy": 2.3938353061676025, "loss/fcd": 1.375, "loss/idx": 9.0, "loss/logits": 0.19751834869384766, "step": 4709 }, { "epoch": 0.07033052359656261, "grad_norm": 0.353515625, "grad_norm_var": 0.005283037821451823, "learning_rate": 0.0001, "loss": 1.4833, "loss/crossentropy": 2.629960536956787, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.19032851606607437, "step": 4710 }, { "epoch": 0.0703454557671776, "grad_norm": 0.43359375, "grad_norm_var": 0.005278269449869792, "learning_rate": 0.0001, "loss": 1.5608, "loss/crossentropy": 2.642956852912903, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.20528195053339005, "step": 4711 }, { "epoch": 0.07036038793779258, "grad_norm": 0.375, "grad_norm_var": 0.005286264419555664, "learning_rate": 0.0001, "loss": 1.591, "loss/crossentropy": 2.522721290588379, "loss/fcd": 1.3828125, "loss/idx": 9.0, "loss/logits": 0.20815794169902802, "step": 4712 }, { "epoch": 0.07037532010840755, "grad_norm": 0.3515625, "grad_norm_var": 0.0054198582967122395, "learning_rate": 0.0001, "loss": 1.6305, "loss/crossentropy": 2.462031602859497, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.23598036170005798, "step": 4713 }, { "epoch": 0.07039025227902254, "grad_norm": 0.365234375, "grad_norm_var": 0.005340305964152018, "learning_rate": 0.0001, "loss": 1.4099, "loss/crossentropy": 2.7923320531845093, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.1754857823252678, "step": 4714 }, { "epoch": 0.07040518444963752, "grad_norm": 0.361328125, "grad_norm_var": 0.0053253173828125, "learning_rate": 0.0001, "loss": 1.5833, "loss/crossentropy": 2.568281650543213, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.22785043716430664, "step": 4715 }, { "epoch": 0.07042011662025251, "grad_norm": 0.404296875, "grad_norm_var": 0.005243539810180664, "learning_rate": 0.0001, "loss": 1.635, "loss/crossentropy": 2.4628924131393433, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.21698708832263947, "step": 4716 }, { "epoch": 0.07043504879086748, "grad_norm": 0.3515625, "grad_norm_var": 0.001962137222290039, "learning_rate": 0.0001, "loss": 1.536, "loss/crossentropy": 2.504340648651123, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.19222097098827362, "step": 4717 }, { "epoch": 0.07044998096148247, "grad_norm": 0.404296875, "grad_norm_var": 0.0018472671508789062, "learning_rate": 0.0001, "loss": 1.457, "loss/crossentropy": 2.704387903213501, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.1718045100569725, "step": 4718 }, { "epoch": 0.07046491313209745, "grad_norm": 0.50390625, "grad_norm_var": 0.0018472671508789062, "learning_rate": 0.0001, "loss": 1.7195, "loss/crossentropy": 2.3803107738494873, "loss/fcd": 1.51171875, "loss/idx": 9.0, "loss/logits": 0.20779212564229965, "step": 4719 }, { "epoch": 0.07047984530271242, "grad_norm": 0.421875, "grad_norm_var": 0.001918474833170573, "learning_rate": 0.0001, "loss": 1.6059, "loss/crossentropy": 2.3779784440994263, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.21135350316762924, "step": 4720 }, { "epoch": 0.07049477747332741, "grad_norm": 0.376953125, "grad_norm_var": 0.0018124739329020182, "learning_rate": 0.0001, "loss": 1.4178, "loss/crossentropy": 2.488665819168091, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.1599768102169037, "step": 4721 }, { "epoch": 0.07050970964394239, "grad_norm": 0.369140625, "grad_norm_var": 0.0017740885416666667, "learning_rate": 0.0001, "loss": 1.5303, "loss/crossentropy": 2.47546648979187, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.20609134435653687, "step": 4722 }, { "epoch": 0.07052464181455738, "grad_norm": 0.33984375, "grad_norm_var": 0.0017501195271809897, "learning_rate": 0.0001, "loss": 1.5442, "loss/crossentropy": 2.471246123313904, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.19656182825565338, "step": 4723 }, { "epoch": 0.07053957398517235, "grad_norm": 0.40625, "grad_norm_var": 0.0017093499501546225, "learning_rate": 0.0001, "loss": 1.6587, "loss/crossentropy": 2.814807415008545, "loss/fcd": 1.43359375, "loss/idx": 9.0, "loss/logits": 0.22508594393730164, "step": 4724 }, { "epoch": 0.07055450615578733, "grad_norm": 0.4296875, "grad_norm_var": 0.0018134911855061848, "learning_rate": 0.0001, "loss": 1.4808, "loss/crossentropy": 2.525724411010742, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18781042098999023, "step": 4725 }, { "epoch": 0.07056943832640232, "grad_norm": 0.353515625, "grad_norm_var": 0.0018134911855061848, "learning_rate": 0.0001, "loss": 1.4974, "loss/crossentropy": 2.466606616973877, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.18881310522556305, "step": 4726 }, { "epoch": 0.0705843704970173, "grad_norm": 0.353515625, "grad_norm_var": 0.0017541885375976563, "learning_rate": 0.0001, "loss": 1.4992, "loss/crossentropy": 2.5930986404418945, "loss/fcd": 1.3046875, "loss/idx": 9.0, "loss/logits": 0.1944858506321907, "step": 4727 }, { "epoch": 0.07059930266763229, "grad_norm": 0.400390625, "grad_norm_var": 0.0017589410146077475, "learning_rate": 0.0001, "loss": 1.5361, "loss/crossentropy": 2.856569290161133, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.1923227235674858, "step": 4728 }, { "epoch": 0.07061423483824726, "grad_norm": 0.423828125, "grad_norm_var": 0.0017430623372395833, "learning_rate": 0.0001, "loss": 1.6122, "loss/crossentropy": 2.4639110565185547, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.20985673367977142, "step": 4729 }, { "epoch": 0.07062916700886224, "grad_norm": 0.404296875, "grad_norm_var": 0.0017011006673177083, "learning_rate": 0.0001, "loss": 1.3592, "loss/crossentropy": 2.5362350940704346, "loss/fcd": 1.203125, "loss/idx": 9.0, "loss/logits": 0.1560811921954155, "step": 4730 }, { "epoch": 0.07064409917947723, "grad_norm": 0.353515625, "grad_norm_var": 0.0017389933268229167, "learning_rate": 0.0001, "loss": 1.4046, "loss/crossentropy": 2.549067974090576, "loss/fcd": 1.23046875, "loss/idx": 9.0, "loss/logits": 0.17410427331924438, "step": 4731 }, { "epoch": 0.0706590313500922, "grad_norm": 0.361328125, "grad_norm_var": 0.0017928441365559896, "learning_rate": 0.0001, "loss": 1.4514, "loss/crossentropy": 2.724418044090271, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.18190618604421616, "step": 4732 }, { "epoch": 0.07067396352070719, "grad_norm": 0.4375, "grad_norm_var": 0.001804033915201823, "learning_rate": 0.0001, "loss": 1.8833, "loss/crossentropy": 2.265506148338318, "loss/fcd": 1.56640625, "loss/idx": 9.0, "loss/logits": 0.31688016653060913, "step": 4733 }, { "epoch": 0.07068889569132217, "grad_norm": 0.40234375, "grad_norm_var": 0.0018021742502848308, "learning_rate": 0.0001, "loss": 1.5105, "loss/crossentropy": 2.8834331035614014, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19800810515880585, "step": 4734 }, { "epoch": 0.07070382786193716, "grad_norm": 0.34375, "grad_norm_var": 0.0011035760243733725, "learning_rate": 0.0001, "loss": 1.4176, "loss/crossentropy": 2.6644831895828247, "loss/fcd": 1.2421875, "loss/idx": 9.0, "loss/logits": 0.1753660961985588, "step": 4735 }, { "epoch": 0.07071876003255213, "grad_norm": 0.4609375, "grad_norm_var": 0.0013852278391520182, "learning_rate": 0.0001, "loss": 1.5293, "loss/crossentropy": 2.591984272003174, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.2012152448296547, "step": 4736 }, { "epoch": 0.07073369220316711, "grad_norm": 0.341796875, "grad_norm_var": 0.0015168348948160807, "learning_rate": 0.0001, "loss": 1.5362, "loss/crossentropy": 2.750018000602722, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.2002265602350235, "step": 4737 }, { "epoch": 0.0707486243737821, "grad_norm": 0.38671875, "grad_norm_var": 0.001495806376139323, "learning_rate": 0.0001, "loss": 1.6224, "loss/crossentropy": 2.732465147972107, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.22006835043430328, "step": 4738 }, { "epoch": 0.07076355654439707, "grad_norm": 0.404296875, "grad_norm_var": 0.0013463179270426433, "learning_rate": 0.0001, "loss": 1.608, "loss/crossentropy": 2.6628313064575195, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.21346353739500046, "step": 4739 }, { "epoch": 0.07077848871501206, "grad_norm": 0.431640625, "grad_norm_var": 0.001436614990234375, "learning_rate": 0.0001, "loss": 1.6839, "loss/crossentropy": 2.5808838605880737, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.23471584171056747, "step": 4740 }, { "epoch": 0.07079342088562704, "grad_norm": 0.38671875, "grad_norm_var": 0.0013422012329101563, "learning_rate": 0.0001, "loss": 1.5328, "loss/crossentropy": 2.92130708694458, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.2085987702012062, "step": 4741 }, { "epoch": 0.07080835305624202, "grad_norm": 0.3515625, "grad_norm_var": 0.0013520399729410808, "learning_rate": 0.0001, "loss": 1.5132, "loss/crossentropy": 2.65986967086792, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.19674497842788696, "step": 4742 }, { "epoch": 0.070823285226857, "grad_norm": 0.482421875, "grad_norm_var": 0.0017590681711832682, "learning_rate": 0.0001, "loss": 2.0946, "loss/crossentropy": 2.4828158617019653, "loss/fcd": 1.72265625, "loss/idx": 9.0, "loss/logits": 0.37189508974552155, "step": 4743 }, { "epoch": 0.07083821739747198, "grad_norm": 0.50390625, "grad_norm_var": 0.002457427978515625, "learning_rate": 0.0001, "loss": 1.5851, "loss/crossentropy": 2.550639510154724, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.19449549168348312, "step": 4744 }, { "epoch": 0.07085314956808697, "grad_norm": 0.34375, "grad_norm_var": 0.002654886245727539, "learning_rate": 0.0001, "loss": 1.4568, "loss/crossentropy": 2.5517311096191406, "loss/fcd": 1.28515625, "loss/idx": 9.0, "loss/logits": 0.17168083786964417, "step": 4745 }, { "epoch": 0.07086808173870195, "grad_norm": 0.439453125, "grad_norm_var": 0.0027533054351806642, "learning_rate": 0.0001, "loss": 1.6426, "loss/crossentropy": 2.8957414627075195, "loss/fcd": 1.40234375, "loss/idx": 9.0, "loss/logits": 0.24025997519493103, "step": 4746 }, { "epoch": 0.07088301390931692, "grad_norm": 0.369140625, "grad_norm_var": 0.0026676019032796225, "learning_rate": 0.0001, "loss": 1.3876, "loss/crossentropy": 2.6141244173049927, "loss/fcd": 1.21875, "loss/idx": 9.0, "loss/logits": 0.16885926574468613, "step": 4747 }, { "epoch": 0.07089794607993191, "grad_norm": 0.36328125, "grad_norm_var": 0.002657000223795573, "learning_rate": 0.0001, "loss": 1.53, "loss/crossentropy": 2.572378635406494, "loss/fcd": 1.34375, "loss/idx": 9.0, "loss/logits": 0.18622738122940063, "step": 4748 }, { "epoch": 0.07091287825054689, "grad_norm": 0.376953125, "grad_norm_var": 0.002608219782511393, "learning_rate": 0.0001, "loss": 1.5441, "loss/crossentropy": 2.721894145011902, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.1885867863893509, "step": 4749 }, { "epoch": 0.07092781042116188, "grad_norm": 0.388671875, "grad_norm_var": 0.0026143391927083335, "learning_rate": 0.0001, "loss": 1.5664, "loss/crossentropy": 2.785942554473877, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.20702239125967026, "step": 4750 }, { "epoch": 0.07094274259177685, "grad_norm": 0.458984375, "grad_norm_var": 0.002604023615519206, "learning_rate": 0.0001, "loss": 1.5489, "loss/crossentropy": 2.6177486181259155, "loss/fcd": 1.3359375, "loss/idx": 9.0, "loss/logits": 0.21299625933170319, "step": 4751 }, { "epoch": 0.07095767476239183, "grad_norm": 0.3671875, "grad_norm_var": 0.0024621168772379558, "learning_rate": 0.0001, "loss": 1.6044, "loss/crossentropy": 2.566752552986145, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.21768267452716827, "step": 4752 }, { "epoch": 0.07097260693300682, "grad_norm": 0.423828125, "grad_norm_var": 0.0022484938303629558, "learning_rate": 0.0001, "loss": 1.5773, "loss/crossentropy": 2.576554298400879, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.19842387735843658, "step": 4753 }, { "epoch": 0.0709875391036218, "grad_norm": 0.37890625, "grad_norm_var": 0.002271254857381185, "learning_rate": 0.0001, "loss": 1.4306, "loss/crossentropy": 2.5249335765838623, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.1767229586839676, "step": 4754 }, { "epoch": 0.07100247127423678, "grad_norm": 0.353515625, "grad_norm_var": 0.0024332523345947264, "learning_rate": 0.0001, "loss": 1.5119, "loss/crossentropy": 2.461852788925171, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.18766435980796814, "step": 4755 }, { "epoch": 0.07101740344485176, "grad_norm": 0.34375, "grad_norm_var": 0.002559852600097656, "learning_rate": 0.0001, "loss": 1.4636, "loss/crossentropy": 2.3259220123291016, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.16675277799367905, "step": 4756 }, { "epoch": 0.07103233561546675, "grad_norm": 0.36328125, "grad_norm_var": 0.0026224136352539064, "learning_rate": 0.0001, "loss": 1.6304, "loss/crossentropy": 2.480155110359192, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.23199378699064255, "step": 4757 }, { "epoch": 0.07104726778608172, "grad_norm": 0.4765625, "grad_norm_var": 0.0028868993123372395, "learning_rate": 0.0001, "loss": 1.6908, "loss/crossentropy": 2.7127811908721924, "loss/fcd": 1.46875, "loss/idx": 9.0, "loss/logits": 0.22201323509216309, "step": 4758 }, { "epoch": 0.0710621999566967, "grad_norm": 0.43359375, "grad_norm_var": 0.0025129795074462892, "learning_rate": 0.0001, "loss": 1.6319, "loss/crossentropy": 2.591728687286377, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.2256508767604828, "step": 4759 }, { "epoch": 0.07107713212731169, "grad_norm": 0.322265625, "grad_norm_var": 0.0020355224609375, "learning_rate": 0.0001, "loss": 1.3495, "loss/crossentropy": 2.551482677459717, "loss/fcd": 1.19921875, "loss/idx": 9.0, "loss/logits": 0.15033076703548431, "step": 4760 }, { "epoch": 0.07109206429792667, "grad_norm": 0.37109375, "grad_norm_var": 0.0019220352172851563, "learning_rate": 0.0001, "loss": 1.5527, "loss/crossentropy": 2.6695351600646973, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.20501495897769928, "step": 4761 }, { "epoch": 0.07110699646854166, "grad_norm": 0.3671875, "grad_norm_var": 0.001766188939412435, "learning_rate": 0.0001, "loss": 1.4331, "loss/crossentropy": 2.4585793018341064, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.17142903804779053, "step": 4762 }, { "epoch": 0.07112192863915663, "grad_norm": 0.36328125, "grad_norm_var": 0.0017806371053059897, "learning_rate": 0.0001, "loss": 1.5021, "loss/crossentropy": 2.7831469774246216, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.19348234683275223, "step": 4763 }, { "epoch": 0.07113686080977161, "grad_norm": 0.3984375, "grad_norm_var": 0.0017583211263020833, "learning_rate": 0.0001, "loss": 1.586, "loss/crossentropy": 2.690499186515808, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.207086019217968, "step": 4764 }, { "epoch": 0.0711517929803866, "grad_norm": 0.3515625, "grad_norm_var": 0.0018316745758056641, "learning_rate": 0.0001, "loss": 1.5844, "loss/crossentropy": 2.4656976461410522, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.20545487105846405, "step": 4765 }, { "epoch": 0.07116672515100157, "grad_norm": 0.375, "grad_norm_var": 0.0018369038899739584, "learning_rate": 0.0001, "loss": 1.62, "loss/crossentropy": 2.6565665006637573, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.2293902039527893, "step": 4766 }, { "epoch": 0.07118165732161656, "grad_norm": 0.361328125, "grad_norm_var": 0.0014602025349934897, "learning_rate": 0.0001, "loss": 1.6094, "loss/crossentropy": 2.462925434112549, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.21091777086257935, "step": 4767 }, { "epoch": 0.07119658949223154, "grad_norm": 0.4296875, "grad_norm_var": 0.0016127904256184897, "learning_rate": 0.0001, "loss": 1.3356, "loss/crossentropy": 2.484135150909424, "loss/fcd": 1.18359375, "loss/idx": 9.0, "loss/logits": 0.15200571715831757, "step": 4768 }, { "epoch": 0.07121152166284651, "grad_norm": 0.45703125, "grad_norm_var": 0.0018665154774983725, "learning_rate": 0.0001, "loss": 1.6173, "loss/crossentropy": 2.568774104118347, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.21882455050945282, "step": 4769 }, { "epoch": 0.0712264538334615, "grad_norm": 0.41796875, "grad_norm_var": 0.0019345442454020182, "learning_rate": 0.0001, "loss": 1.6113, "loss/crossentropy": 2.369409203529358, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.20113491266965866, "step": 4770 }, { "epoch": 0.07124138600407648, "grad_norm": 0.34765625, "grad_norm_var": 0.0019625345865885418, "learning_rate": 0.0001, "loss": 1.6211, "loss/crossentropy": 2.4039554595947266, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.23050130903720856, "step": 4771 }, { "epoch": 0.07125631817469147, "grad_norm": 0.53125, "grad_norm_var": 0.003097788492838542, "learning_rate": 0.0001, "loss": 2.2556, "loss/crossentropy": 2.464118003845215, "loss/fcd": 1.9296875, "loss/idx": 9.0, "loss/logits": 0.32593894749879837, "step": 4772 }, { "epoch": 0.07127125034530644, "grad_norm": 0.375, "grad_norm_var": 0.003052202860514323, "learning_rate": 0.0001, "loss": 1.4657, "loss/crossentropy": 2.7045055627822876, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.17269982397556305, "step": 4773 }, { "epoch": 0.07128618251592142, "grad_norm": 0.53125, "grad_norm_var": 0.0038070042928059896, "learning_rate": 0.0001, "loss": 1.9623, "loss/crossentropy": 2.807793140411377, "loss/fcd": 1.6796875, "loss/idx": 9.0, "loss/logits": 0.2825971692800522, "step": 4774 }, { "epoch": 0.07130111468653641, "grad_norm": 0.48046875, "grad_norm_var": 0.004141171773274739, "learning_rate": 0.0001, "loss": 1.6532, "loss/crossentropy": 2.8782676458358765, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.21573622524738312, "step": 4775 }, { "epoch": 0.07131604685715139, "grad_norm": 0.376953125, "grad_norm_var": 0.0037246068318684896, "learning_rate": 0.0001, "loss": 1.5394, "loss/crossentropy": 2.496760845184326, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.18780288100242615, "step": 4776 }, { "epoch": 0.07133097902776638, "grad_norm": 0.404296875, "grad_norm_var": 0.0036281426747639973, "learning_rate": 0.0001, "loss": 1.5692, "loss/crossentropy": 2.8109374046325684, "loss/fcd": 1.36328125, "loss/idx": 9.0, "loss/logits": 0.2059365063905716, "step": 4777 }, { "epoch": 0.07134591119838135, "grad_norm": 0.369140625, "grad_norm_var": 0.003617095947265625, "learning_rate": 0.0001, "loss": 1.4788, "loss/crossentropy": 2.6132569313049316, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18586397916078568, "step": 4778 }, { "epoch": 0.07136084336899634, "grad_norm": 0.359375, "grad_norm_var": 0.0036427179972330728, "learning_rate": 0.0001, "loss": 1.4251, "loss/crossentropy": 2.6319929361343384, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.16728521138429642, "step": 4779 }, { "epoch": 0.07137577553961132, "grad_norm": 0.373046875, "grad_norm_var": 0.0037235101064046223, "learning_rate": 0.0001, "loss": 1.7401, "loss/crossentropy": 2.6522542238235474, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.25183551013469696, "step": 4780 }, { "epoch": 0.07139070771022629, "grad_norm": 0.3671875, "grad_norm_var": 0.0036194960276285807, "learning_rate": 0.0001, "loss": 1.6377, "loss/crossentropy": 2.529379725456238, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.21580862253904343, "step": 4781 }, { "epoch": 0.07140563988084128, "grad_norm": 0.349609375, "grad_norm_var": 0.0037775675455729166, "learning_rate": 0.0001, "loss": 1.3368, "loss/crossentropy": 2.6329054832458496, "loss/fcd": 1.1875, "loss/idx": 9.0, "loss/logits": 0.14925313740968704, "step": 4782 }, { "epoch": 0.07142057205145626, "grad_norm": 0.486328125, "grad_norm_var": 0.003972880045572917, "learning_rate": 0.0001, "loss": 1.727, "loss/crossentropy": 2.692392587661743, "loss/fcd": 1.5, "loss/idx": 9.0, "loss/logits": 0.2269602045416832, "step": 4783 }, { "epoch": 0.07143550422207125, "grad_norm": 0.373046875, "grad_norm_var": 0.004070138931274414, "learning_rate": 0.0001, "loss": 1.5125, "loss/crossentropy": 2.6891673803329468, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.20392495393753052, "step": 4784 }, { "epoch": 0.07145043639268622, "grad_norm": 0.37890625, "grad_norm_var": 0.0039874871571858725, "learning_rate": 0.0001, "loss": 1.6178, "loss/crossentropy": 2.2839479446411133, "loss/fcd": 1.421875, "loss/idx": 9.0, "loss/logits": 0.19593389332294464, "step": 4785 }, { "epoch": 0.0714653685633012, "grad_norm": 0.345703125, "grad_norm_var": 0.004213905334472657, "learning_rate": 0.0001, "loss": 1.4371, "loss/crossentropy": 2.6092292070388794, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.1832425594329834, "step": 4786 }, { "epoch": 0.07148030073391619, "grad_norm": 0.3515625, "grad_norm_var": 0.004185994466145833, "learning_rate": 0.0001, "loss": 1.4182, "loss/crossentropy": 2.6324630975723267, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.1721448302268982, "step": 4787 }, { "epoch": 0.07149523290453116, "grad_norm": 0.376953125, "grad_norm_var": 0.0030420780181884765, "learning_rate": 0.0001, "loss": 1.6094, "loss/crossentropy": 2.6746773719787598, "loss/fcd": 1.3984375, "loss/idx": 9.0, "loss/logits": 0.21097131818532944, "step": 4788 }, { "epoch": 0.07151016507514615, "grad_norm": 0.40625, "grad_norm_var": 0.0030252933502197266, "learning_rate": 0.0001, "loss": 1.632, "loss/crossentropy": 2.5249571800231934, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.21405182778835297, "step": 4789 }, { "epoch": 0.07152509724576113, "grad_norm": 0.3828125, "grad_norm_var": 0.0017182509104410808, "learning_rate": 0.0001, "loss": 1.4253, "loss/crossentropy": 2.729907274246216, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.16746602952480316, "step": 4790 }, { "epoch": 0.0715400294163761, "grad_norm": 0.4453125, "grad_norm_var": 0.0013543287913004556, "learning_rate": 0.0001, "loss": 1.7078, "loss/crossentropy": 2.342802882194519, "loss/fcd": 1.49609375, "loss/idx": 9.0, "loss/logits": 0.21166697889566422, "step": 4791 }, { "epoch": 0.0715549615869911, "grad_norm": 0.419921875, "grad_norm_var": 0.0014284610748291015, "learning_rate": 0.0001, "loss": 1.6662, "loss/crossentropy": 2.5830153226852417, "loss/fcd": 1.4375, "loss/idx": 9.0, "loss/logits": 0.22870098799467087, "step": 4792 }, { "epoch": 0.07156989375760607, "grad_norm": 0.37109375, "grad_norm_var": 0.0014200846354166666, "learning_rate": 0.0001, "loss": 1.4596, "loss/crossentropy": 2.6496236324310303, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18230195343494415, "step": 4793 }, { "epoch": 0.07158482592822106, "grad_norm": 0.4609375, "grad_norm_var": 0.0017555077870686849, "learning_rate": 0.0001, "loss": 1.4968, "loss/crossentropy": 2.618600368499756, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.18433918058872223, "step": 4794 }, { "epoch": 0.07159975809883604, "grad_norm": 0.361328125, "grad_norm_var": 0.0017476399739583333, "learning_rate": 0.0001, "loss": 1.473, "loss/crossentropy": 2.4650447368621826, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.180073544383049, "step": 4795 }, { "epoch": 0.07161469026945103, "grad_norm": 0.359375, "grad_norm_var": 0.00179136594136556, "learning_rate": 0.0001, "loss": 1.5799, "loss/crossentropy": 2.5120112895965576, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.2010418325662613, "step": 4796 }, { "epoch": 0.071629622440066, "grad_norm": 0.3984375, "grad_norm_var": 0.0017583052317301433, "learning_rate": 0.0001, "loss": 1.5066, "loss/crossentropy": 2.761707901954651, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.19406148046255112, "step": 4797 }, { "epoch": 0.07164455461068098, "grad_norm": 0.380859375, "grad_norm_var": 0.0016438643137613933, "learning_rate": 0.0001, "loss": 1.5119, "loss/crossentropy": 2.640535831451416, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.19160054624080658, "step": 4798 }, { "epoch": 0.07165948678129597, "grad_norm": 0.375, "grad_norm_var": 0.0010431925455729166, "learning_rate": 0.0001, "loss": 1.5217, "loss/crossentropy": 2.8753509521484375, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19358526915311813, "step": 4799 }, { "epoch": 0.07167441895191094, "grad_norm": 0.40625, "grad_norm_var": 0.0010515689849853516, "learning_rate": 0.0001, "loss": 1.5212, "loss/crossentropy": 2.5142730474472046, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19307003915309906, "step": 4800 }, { "epoch": 0.07168935112252593, "grad_norm": 0.361328125, "grad_norm_var": 0.00109405517578125, "learning_rate": 0.0001, "loss": 1.4135, "loss/crossentropy": 2.4856473207473755, "loss/fcd": 1.23828125, "loss/idx": 9.0, "loss/logits": 0.17517779767513275, "step": 4801 }, { "epoch": 0.07170428329314091, "grad_norm": 0.361328125, "grad_norm_var": 0.0010218302408854167, "learning_rate": 0.0001, "loss": 1.5483, "loss/crossentropy": 2.4915499687194824, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.2006581947207451, "step": 4802 }, { "epoch": 0.07171921546375588, "grad_norm": 0.412109375, "grad_norm_var": 0.0009513696034749349, "learning_rate": 0.0001, "loss": 1.5745, "loss/crossentropy": 2.4413899183273315, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.18382703512907028, "step": 4803 }, { "epoch": 0.07173414763437087, "grad_norm": 0.412109375, "grad_norm_var": 0.0009559472401936849, "learning_rate": 0.0001, "loss": 1.5367, "loss/crossentropy": 2.922303080558777, "loss/fcd": 1.34765625, "loss/idx": 9.0, "loss/logits": 0.18908153474330902, "step": 4804 }, { "epoch": 0.07174907980498585, "grad_norm": 0.40234375, "grad_norm_var": 0.0009508609771728515, "learning_rate": 0.0001, "loss": 1.4768, "loss/crossentropy": 2.5904228687286377, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.17995233833789825, "step": 4805 }, { "epoch": 0.07176401197560084, "grad_norm": 0.38671875, "grad_norm_var": 0.0009457747141520183, "learning_rate": 0.0001, "loss": 1.7873, "loss/crossentropy": 2.5835620164871216, "loss/fcd": 1.53125, "loss/idx": 9.0, "loss/logits": 0.2560271769762039, "step": 4806 }, { "epoch": 0.07177894414621581, "grad_norm": 0.337890625, "grad_norm_var": 0.0009414037068684896, "learning_rate": 0.0001, "loss": 1.5649, "loss/crossentropy": 2.567424774169922, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.20946507900953293, "step": 4807 }, { "epoch": 0.07179387631683079, "grad_norm": 0.33984375, "grad_norm_var": 0.0010007063547770182, "learning_rate": 0.0001, "loss": 1.406, "loss/crossentropy": 2.6690701246261597, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.17160821706056595, "step": 4808 }, { "epoch": 0.07180880848744578, "grad_norm": 0.345703125, "grad_norm_var": 0.001081085205078125, "learning_rate": 0.0001, "loss": 1.5242, "loss/crossentropy": 2.2594443559646606, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.19996970891952515, "step": 4809 }, { "epoch": 0.07182374065806076, "grad_norm": 0.392578125, "grad_norm_var": 0.0006477197011311848, "learning_rate": 0.0001, "loss": 1.5247, "loss/crossentropy": 2.6218146085739136, "loss/fcd": 1.328125, "loss/idx": 9.0, "loss/logits": 0.19658038765192032, "step": 4810 }, { "epoch": 0.07183867282867575, "grad_norm": 0.3359375, "grad_norm_var": 0.0007413228352864584, "learning_rate": 0.0001, "loss": 1.4168, "loss/crossentropy": 2.5169278383255005, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.1629364788532257, "step": 4811 }, { "epoch": 0.07185360499929072, "grad_norm": 0.369140625, "grad_norm_var": 0.0007263024648030599, "learning_rate": 0.0001, "loss": 1.5167, "loss/crossentropy": 2.598116159439087, "loss/fcd": 1.31640625, "loss/idx": 9.0, "loss/logits": 0.20026160776615143, "step": 4812 }, { "epoch": 0.0718685371699057, "grad_norm": 0.41796875, "grad_norm_var": 0.0008083184560139974, "learning_rate": 0.0001, "loss": 1.3926, "loss/crossentropy": 2.6709492206573486, "loss/fcd": 1.23046875, "loss/idx": 9.0, "loss/logits": 0.1620849370956421, "step": 4813 }, { "epoch": 0.07188346934052069, "grad_norm": 0.435546875, "grad_norm_var": 0.0010210514068603516, "learning_rate": 0.0001, "loss": 1.6286, "loss/crossentropy": 2.740757465362549, "loss/fcd": 1.390625, "loss/idx": 9.0, "loss/logits": 0.23798348754644394, "step": 4814 }, { "epoch": 0.07189840151113566, "grad_norm": 0.341796875, "grad_norm_var": 0.0011153539021809896, "learning_rate": 0.0001, "loss": 1.4103, "loss/crossentropy": 2.4509007930755615, "loss/fcd": 1.23828125, "loss/idx": 9.0, "loss/logits": 0.1720517948269844, "step": 4815 }, { "epoch": 0.07191333368175065, "grad_norm": 0.40234375, "grad_norm_var": 0.0011019388834635417, "learning_rate": 0.0001, "loss": 1.5292, "loss/crossentropy": 2.745737910270691, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.18934935331344604, "step": 4816 }, { "epoch": 0.07192826585236563, "grad_norm": 0.373046875, "grad_norm_var": 0.001083819071451823, "learning_rate": 0.0001, "loss": 1.5512, "loss/crossentropy": 2.6033042669296265, "loss/fcd": 1.359375, "loss/idx": 9.0, "loss/logits": 0.19185706973075867, "step": 4817 }, { "epoch": 0.07194319802298062, "grad_norm": 0.365234375, "grad_norm_var": 0.0010754903157552083, "learning_rate": 0.0001, "loss": 1.4238, "loss/crossentropy": 2.458145260810852, "loss/fcd": 1.25390625, "loss/idx": 9.0, "loss/logits": 0.16990014165639877, "step": 4818 }, { "epoch": 0.07195813019359559, "grad_norm": 0.392578125, "grad_norm_var": 0.0010141372680664063, "learning_rate": 0.0001, "loss": 1.7345, "loss/crossentropy": 2.656064033508301, "loss/fcd": 1.49609375, "loss/idx": 9.0, "loss/logits": 0.23844347894191742, "step": 4819 }, { "epoch": 0.07197306236421057, "grad_norm": 0.376953125, "grad_norm_var": 0.00093231201171875, "learning_rate": 0.0001, "loss": 1.6205, "loss/crossentropy": 2.596254825592041, "loss/fcd": 1.39453125, "loss/idx": 9.0, "loss/logits": 0.22595258057117462, "step": 4820 }, { "epoch": 0.07198799453482556, "grad_norm": 0.4453125, "grad_norm_var": 0.0011987686157226562, "learning_rate": 0.0001, "loss": 1.6505, "loss/crossentropy": 2.438060998916626, "loss/fcd": 1.44921875, "loss/idx": 9.0, "loss/logits": 0.2012333646416664, "step": 4821 }, { "epoch": 0.07200292670544053, "grad_norm": 0.34765625, "grad_norm_var": 0.0012521743774414062, "learning_rate": 0.0001, "loss": 1.5902, "loss/crossentropy": 2.3376808166503906, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.21907245367765427, "step": 4822 }, { "epoch": 0.07201785887605552, "grad_norm": 0.43359375, "grad_norm_var": 0.0013355096181233725, "learning_rate": 0.0001, "loss": 1.5651, "loss/crossentropy": 2.4630017280578613, "loss/fcd": 1.3671875, "loss/idx": 9.0, "loss/logits": 0.19790121167898178, "step": 4823 }, { "epoch": 0.0720327910466705, "grad_norm": 0.474609375, "grad_norm_var": 0.0017094930013020833, "learning_rate": 0.0001, "loss": 1.6397, "loss/crossentropy": 2.2122609615325928, "loss/fcd": 1.44140625, "loss/idx": 9.0, "loss/logits": 0.19830961525440216, "step": 4824 }, { "epoch": 0.07204772321728548, "grad_norm": 0.38671875, "grad_norm_var": 0.0015689690907796225, "learning_rate": 0.0001, "loss": 1.4885, "loss/crossentropy": 2.557101607322693, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.17988872528076172, "step": 4825 }, { "epoch": 0.07206265538790046, "grad_norm": 0.353515625, "grad_norm_var": 0.0016675154368082682, "learning_rate": 0.0001, "loss": 1.4586, "loss/crossentropy": 2.535897135734558, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.18512311577796936, "step": 4826 }, { "epoch": 0.07207758755851544, "grad_norm": 0.3515625, "grad_norm_var": 0.0015685876210530599, "learning_rate": 0.0001, "loss": 1.412, "loss/crossentropy": 2.4564712047576904, "loss/fcd": 1.24609375, "loss/idx": 9.0, "loss/logits": 0.16594240069389343, "step": 4827 }, { "epoch": 0.07209251972913043, "grad_norm": 0.6328125, "grad_norm_var": 0.005119832356770834, "learning_rate": 0.0001, "loss": 1.5532, "loss/crossentropy": 2.7121591567993164, "loss/fcd": 1.32421875, "loss/idx": 9.0, "loss/logits": 0.22898587584495544, "step": 4828 }, { "epoch": 0.0721074518997454, "grad_norm": 0.349609375, "grad_norm_var": 0.005322885513305664, "learning_rate": 0.0001, "loss": 1.5307, "loss/crossentropy": 2.502143621444702, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.19087158143520355, "step": 4829 }, { "epoch": 0.07212238407036038, "grad_norm": 0.380859375, "grad_norm_var": 0.005279270807902018, "learning_rate": 0.0001, "loss": 1.4858, "loss/crossentropy": 2.7452410459518433, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.18502889573574066, "step": 4830 }, { "epoch": 0.07213731624097537, "grad_norm": 0.408203125, "grad_norm_var": 0.005035003026326497, "learning_rate": 0.0001, "loss": 1.5929, "loss/crossentropy": 2.520404815673828, "loss/fcd": 1.38671875, "loss/idx": 9.0, "loss/logits": 0.20615727454423904, "step": 4831 }, { "epoch": 0.07215224841159035, "grad_norm": 0.359375, "grad_norm_var": 0.0051636854807535805, "learning_rate": 0.0001, "loss": 1.4905, "loss/crossentropy": 2.629156470298767, "loss/fcd": 1.30859375, "loss/idx": 9.0, "loss/logits": 0.18195407092571259, "step": 4832 }, { "epoch": 0.07216718058220534, "grad_norm": 0.33984375, "grad_norm_var": 0.005360666910807292, "learning_rate": 0.0001, "loss": 1.4651, "loss/crossentropy": 2.423100233078003, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.17605937272310257, "step": 4833 }, { "epoch": 0.07218211275282031, "grad_norm": 0.36328125, "grad_norm_var": 0.005369933446248373, "learning_rate": 0.0001, "loss": 1.4717, "loss/crossentropy": 2.4280236959457397, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.17869530618190765, "step": 4834 }, { "epoch": 0.07219704492343529, "grad_norm": 0.494140625, "grad_norm_var": 0.005917088190714518, "learning_rate": 0.0001, "loss": 1.6223, "loss/crossentropy": 2.4465311765670776, "loss/fcd": 1.4296875, "loss/idx": 9.0, "loss/logits": 0.19265686720609665, "step": 4835 }, { "epoch": 0.07221197709405028, "grad_norm": 0.380859375, "grad_norm_var": 0.005902846654256185, "learning_rate": 0.0001, "loss": 1.3982, "loss/crossentropy": 2.551528811454773, "loss/fcd": 1.234375, "loss/idx": 9.0, "loss/logits": 0.1638401821255684, "step": 4836 }, { "epoch": 0.07222690926466525, "grad_norm": 0.43359375, "grad_norm_var": 0.005850585301717123, "learning_rate": 0.0001, "loss": 1.702, "loss/crossentropy": 2.5917187929153442, "loss/fcd": 1.47265625, "loss/idx": 9.0, "loss/logits": 0.22934884577989578, "step": 4837 }, { "epoch": 0.07224184143528024, "grad_norm": 0.318359375, "grad_norm_var": 0.006130727132161459, "learning_rate": 0.0001, "loss": 1.455, "loss/crossentropy": 2.4258724451065063, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.18158718198537827, "step": 4838 }, { "epoch": 0.07225677360589522, "grad_norm": 0.34375, "grad_norm_var": 0.006278419494628906, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.6647961139678955, "loss/fcd": 1.19921875, "loss/idx": 9.0, "loss/logits": 0.16521456092596054, "step": 4839 }, { "epoch": 0.07227170577651021, "grad_norm": 0.3515625, "grad_norm_var": 0.005971002578735352, "learning_rate": 0.0001, "loss": 1.4528, "loss/crossentropy": 2.3735220432281494, "loss/fcd": 1.2890625, "loss/idx": 9.0, "loss/logits": 0.16372499614953995, "step": 4840 }, { "epoch": 0.07228663794712518, "grad_norm": 0.365234375, "grad_norm_var": 0.006010691324869792, "learning_rate": 0.0001, "loss": 1.6217, "loss/crossentropy": 2.25876122713089, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.1959071010351181, "step": 4841 }, { "epoch": 0.07230157011774016, "grad_norm": 0.37109375, "grad_norm_var": 0.005946461359659831, "learning_rate": 0.0001, "loss": 1.4345, "loss/crossentropy": 2.6227822303771973, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.1727655678987503, "step": 4842 }, { "epoch": 0.07231650228835515, "grad_norm": 0.359375, "grad_norm_var": 0.005909967422485352, "learning_rate": 0.0001, "loss": 1.4822, "loss/crossentropy": 2.4832202196121216, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.1813843697309494, "step": 4843 }, { "epoch": 0.07233143445897013, "grad_norm": 0.4140625, "grad_norm_var": 0.0018404483795166015, "learning_rate": 0.0001, "loss": 1.52, "loss/crossentropy": 2.5653443336486816, "loss/fcd": 1.33984375, "loss/idx": 9.0, "loss/logits": 0.18018826842308044, "step": 4844 }, { "epoch": 0.07234636662958512, "grad_norm": 0.373046875, "grad_norm_var": 0.001788949966430664, "learning_rate": 0.0001, "loss": 1.671, "loss/crossentropy": 2.4796831607818604, "loss/fcd": 1.41796875, "loss/idx": 9.0, "loss/logits": 0.25299327075481415, "step": 4845 }, { "epoch": 0.07236129880020009, "grad_norm": 0.486328125, "grad_norm_var": 0.0025167942047119142, "learning_rate": 0.0001, "loss": 1.6649, "loss/crossentropy": 2.906501889228821, "loss/fcd": 1.42578125, "loss/idx": 9.0, "loss/logits": 0.23907707631587982, "step": 4846 }, { "epoch": 0.07237623097081507, "grad_norm": 0.369140625, "grad_norm_var": 0.0024919986724853517, "learning_rate": 0.0001, "loss": 1.5017, "loss/crossentropy": 2.6066662073135376, "loss/fcd": 1.3125, "loss/idx": 9.0, "loss/logits": 0.1891603097319603, "step": 4847 }, { "epoch": 0.07239116314143006, "grad_norm": 0.345703125, "grad_norm_var": 0.0025461832682291668, "learning_rate": 0.0001, "loss": 1.4253, "loss/crossentropy": 2.6311315298080444, "loss/fcd": 1.25, "loss/idx": 9.0, "loss/logits": 0.17530765384435654, "step": 4848 }, { "epoch": 0.07240609531204503, "grad_norm": 0.359375, "grad_norm_var": 0.002460670471191406, "learning_rate": 0.0001, "loss": 1.4368, "loss/crossentropy": 2.50286865234375, "loss/fcd": 1.26171875, "loss/idx": 9.0, "loss/logits": 0.17512854933738708, "step": 4849 }, { "epoch": 0.07242102748266002, "grad_norm": 0.486328125, "grad_norm_var": 0.0030825138092041016, "learning_rate": 0.0001, "loss": 1.7309, "loss/crossentropy": 2.531716227531433, "loss/fcd": 1.48828125, "loss/idx": 9.0, "loss/logits": 0.24259446561336517, "step": 4850 }, { "epoch": 0.072435959653275, "grad_norm": 0.380859375, "grad_norm_var": 0.0023228804270426433, "learning_rate": 0.0001, "loss": 1.5355, "loss/crossentropy": 2.4183924198150635, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.203449048101902, "step": 4851 }, { "epoch": 0.07245089182388997, "grad_norm": 0.37109375, "grad_norm_var": 0.0023324966430664064, "learning_rate": 0.0001, "loss": 1.4227, "loss/crossentropy": 2.6517646312713623, "loss/fcd": 1.2578125, "loss/idx": 9.0, "loss/logits": 0.16484073549509048, "step": 4852 }, { "epoch": 0.07246582399450496, "grad_norm": 0.359375, "grad_norm_var": 0.002176666259765625, "learning_rate": 0.0001, "loss": 1.5177, "loss/crossentropy": 2.537769079208374, "loss/fcd": 1.3203125, "loss/idx": 9.0, "loss/logits": 0.19735842943191528, "step": 4853 }, { "epoch": 0.07248075616511994, "grad_norm": 0.404296875, "grad_norm_var": 0.0019500732421875, "learning_rate": 0.0001, "loss": 1.5748, "loss/crossentropy": 2.521943688392639, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.1959160640835762, "step": 4854 }, { "epoch": 0.07249568833573493, "grad_norm": 0.4296875, "grad_norm_var": 0.0019528706868489583, "learning_rate": 0.0001, "loss": 1.6098, "loss/crossentropy": 2.623442769050598, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.2035784125328064, "step": 4855 }, { "epoch": 0.0725106205063499, "grad_norm": 0.376953125, "grad_norm_var": 0.0018658796946207682, "learning_rate": 0.0001, "loss": 1.4458, "loss/crossentropy": 2.6627360582351685, "loss/fcd": 1.2734375, "loss/idx": 9.0, "loss/logits": 0.17238369584083557, "step": 4856 }, { "epoch": 0.0725255526769649, "grad_norm": 0.38671875, "grad_norm_var": 0.0018216451009114584, "learning_rate": 0.0001, "loss": 1.5477, "loss/crossentropy": 2.6159472465515137, "loss/fcd": 1.35546875, "loss/idx": 9.0, "loss/logits": 0.19227948784828186, "step": 4857 }, { "epoch": 0.07254048484757987, "grad_norm": 0.376953125, "grad_norm_var": 0.001807387669881185, "learning_rate": 0.0001, "loss": 1.6075, "loss/crossentropy": 2.559147596359253, "loss/fcd": 1.41015625, "loss/idx": 9.0, "loss/logits": 0.1973259299993515, "step": 4858 }, { "epoch": 0.07255541701819485, "grad_norm": 0.396484375, "grad_norm_var": 0.0017297744750976562, "learning_rate": 0.0001, "loss": 1.6081, "loss/crossentropy": 2.641115188598633, "loss/fcd": 1.40625, "loss/idx": 9.0, "loss/logits": 0.20188166946172714, "step": 4859 }, { "epoch": 0.07257034918880984, "grad_norm": 0.34765625, "grad_norm_var": 0.0018346150716145833, "learning_rate": 0.0001, "loss": 1.608, "loss/crossentropy": 2.512998938560486, "loss/fcd": 1.4140625, "loss/idx": 9.0, "loss/logits": 0.19392171502113342, "step": 4860 }, { "epoch": 0.07258528135942481, "grad_norm": 0.345703125, "grad_norm_var": 0.0019454320271809896, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.807400345802307, "loss/fcd": 1.27734375, "loss/idx": 9.0, "loss/logits": 0.18979384005069733, "step": 4861 }, { "epoch": 0.0726002135300398, "grad_norm": 0.3984375, "grad_norm_var": 0.0012866814931233725, "learning_rate": 0.0001, "loss": 1.5448, "loss/crossentropy": 2.5085930824279785, "loss/fcd": 1.3515625, "loss/idx": 9.0, "loss/logits": 0.1932726949453354, "step": 4862 }, { "epoch": 0.07261514570065478, "grad_norm": 0.357421875, "grad_norm_var": 0.0013175805409749348, "learning_rate": 0.0001, "loss": 1.4777, "loss/crossentropy": 2.490805983543396, "loss/fcd": 1.30078125, "loss/idx": 9.0, "loss/logits": 0.17696363478899002, "step": 4863 }, { "epoch": 0.07263007787126975, "grad_norm": 0.3359375, "grad_norm_var": 0.0013717015584309895, "learning_rate": 0.0001, "loss": 1.4036, "loss/crossentropy": 2.5436906814575195, "loss/fcd": 1.23828125, "loss/idx": 9.0, "loss/logits": 0.16528549790382385, "step": 4864 }, { "epoch": 0.07264501004188474, "grad_norm": 0.37109375, "grad_norm_var": 0.0013448079427083334, "learning_rate": 0.0001, "loss": 1.5228, "loss/crossentropy": 2.535715341567993, "loss/fcd": 1.33203125, "loss/idx": 9.0, "loss/logits": 0.19078505039215088, "step": 4865 }, { "epoch": 0.07265994221249972, "grad_norm": 0.37890625, "grad_norm_var": 0.0005833784739176433, "learning_rate": 0.0001, "loss": 1.4763, "loss/crossentropy": 2.613827109336853, "loss/fcd": 1.29296875, "loss/idx": 9.0, "loss/logits": 0.18335076421499252, "step": 4866 }, { "epoch": 0.07267487438311471, "grad_norm": 0.33203125, "grad_norm_var": 0.0007013956705729167, "learning_rate": 0.0001, "loss": 1.5761, "loss/crossentropy": 2.595254898071289, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.2049889862537384, "step": 4867 }, { "epoch": 0.07268980655372968, "grad_norm": 0.328125, "grad_norm_var": 0.0008279800415039063, "learning_rate": 0.0001, "loss": 1.4065, "loss/crossentropy": 2.7691646814346313, "loss/fcd": 1.2265625, "loss/idx": 9.0, "loss/logits": 0.1799296885728836, "step": 4868 }, { "epoch": 0.07270473872434466, "grad_norm": 0.33984375, "grad_norm_var": 0.00088043212890625, "learning_rate": 0.0001, "loss": 1.5555, "loss/crossentropy": 2.5410443544387817, "loss/fcd": 1.37109375, "loss/idx": 9.0, "loss/logits": 0.18438023328781128, "step": 4869 }, { "epoch": 0.07271967089495965, "grad_norm": 0.3984375, "grad_norm_var": 0.000855112075805664, "learning_rate": 0.0001, "loss": 1.4837, "loss/crossentropy": 2.5383495092391968, "loss/fcd": 1.296875, "loss/idx": 9.0, "loss/logits": 0.18687140196561813, "step": 4870 }, { "epoch": 0.07273460306557462, "grad_norm": 0.52734375, "grad_norm_var": 0.002244297663370768, "learning_rate": 0.0001, "loss": 1.449, "loss/crossentropy": 2.497811794281006, "loss/fcd": 1.26953125, "loss/idx": 9.0, "loss/logits": 0.17946356534957886, "step": 4871 }, { "epoch": 0.07274953523618961, "grad_norm": 0.46875, "grad_norm_var": 0.002796363830566406, "learning_rate": 0.0001, "loss": 1.5998, "loss/crossentropy": 2.6234577894210815, "loss/fcd": 1.37890625, "loss/idx": 9.0, "loss/logits": 0.22090879082679749, "step": 4872 }, { "epoch": 0.07276446740680459, "grad_norm": 0.98046875, "grad_norm_var": 0.02531325022379557, "learning_rate": 0.0001, "loss": 1.8431, "loss/crossentropy": 2.189985990524292, "loss/fcd": 1.640625, "loss/idx": 9.0, "loss/logits": 0.20247850567102432, "step": 4873 }, { "epoch": 0.07277939957741957, "grad_norm": 0.392578125, "grad_norm_var": 0.025243568420410156, "learning_rate": 0.0001, "loss": 1.3517, "loss/crossentropy": 2.722703456878662, "loss/fcd": 1.19921875, "loss/idx": 9.0, "loss/logits": 0.15243849903345108, "step": 4874 }, { "epoch": 0.07279433174803455, "grad_norm": 0.263671875, "grad_norm_var": 0.02673943837483724, "learning_rate": 0.0001, "loss": 1.4208, "loss/crossentropy": 2.6596176624298096, "loss/fcd": 1.23046875, "loss/idx": 9.5, "loss/logits": 0.19037597626447678, "step": 4875 }, { "epoch": 0.07280926391864953, "grad_norm": 0.244140625, "grad_norm_var": 0.028275156021118165, "learning_rate": 0.0001, "loss": 1.3586, "loss/crossentropy": 2.541937232017517, "loss/fcd": 1.1796875, "loss/idx": 9.5, "loss/logits": 0.1788657009601593, "step": 4876 }, { "epoch": 0.07282419608926452, "grad_norm": 0.24609375, "grad_norm_var": 0.029668617248535156, "learning_rate": 0.0001, "loss": 1.441, "loss/crossentropy": 2.467716932296753, "loss/fcd": 1.25, "loss/idx": 9.5, "loss/logits": 0.1909959316253662, "step": 4877 }, { "epoch": 0.0728391282598795, "grad_norm": 0.232421875, "grad_norm_var": 0.03137497901916504, "learning_rate": 0.0001, "loss": 1.3972, "loss/crossentropy": 2.7378417253494263, "loss/fcd": 1.203125, "loss/idx": 9.5, "loss/logits": 0.19409243762493134, "step": 4878 }, { "epoch": 0.07285406043049449, "grad_norm": 0.2333984375, "grad_norm_var": 0.032830901940663654, "learning_rate": 0.0001, "loss": 1.3859, "loss/crossentropy": 2.7443394660949707, "loss/fcd": 1.203125, "loss/idx": 9.5, "loss/logits": 0.18279416859149933, "step": 4879 }, { "epoch": 0.07286899260110946, "grad_norm": 0.2734375, "grad_norm_var": 0.03343871037165324, "learning_rate": 0.0001, "loss": 1.5151, "loss/crossentropy": 2.481309175491333, "loss/fcd": 1.296875, "loss/idx": 9.5, "loss/logits": 0.21819666028022766, "step": 4880 }, { "epoch": 0.07288392477172444, "grad_norm": 0.248046875, "grad_norm_var": 0.03446009556452433, "learning_rate": 0.0001, "loss": 1.3782, "loss/crossentropy": 2.741632580757141, "loss/fcd": 1.1953125, "loss/idx": 9.5, "loss/logits": 0.18284394592046738, "step": 4881 }, { "epoch": 0.07289885694233943, "grad_norm": 0.279296875, "grad_norm_var": 0.034935120741526285, "learning_rate": 0.0001, "loss": 1.5136, "loss/crossentropy": 2.752307653427124, "loss/fcd": 1.2890625, "loss/idx": 9.5, "loss/logits": 0.2245517298579216, "step": 4882 }, { "epoch": 0.0729137891129544, "grad_norm": 0.255859375, "grad_norm_var": 0.03559964100519816, "learning_rate": 0.0001, "loss": 1.4877, "loss/crossentropy": 2.6740591526031494, "loss/fcd": 1.265625, "loss/idx": 9.5, "loss/logits": 0.2220396175980568, "step": 4883 }, { "epoch": 0.07292872128356939, "grad_norm": 0.240234375, "grad_norm_var": 0.036420754591623944, "learning_rate": 0.0001, "loss": 1.4473, "loss/crossentropy": 2.507956027984619, "loss/fcd": 1.25, "loss/idx": 9.5, "loss/logits": 0.19725660234689713, "step": 4884 }, { "epoch": 0.07294365345418437, "grad_norm": 0.23828125, "grad_norm_var": 0.03722330331802368, "learning_rate": 0.0001, "loss": 1.4, "loss/crossentropy": 2.449563980102539, "loss/fcd": 1.2109375, "loss/idx": 9.5, "loss/logits": 0.18903236091136932, "step": 4885 }, { "epoch": 0.07295858562479934, "grad_norm": 0.259765625, "grad_norm_var": 0.03743997812271118, "learning_rate": 0.0001, "loss": 1.5319, "loss/crossentropy": 2.314681887626648, "loss/fcd": 1.328125, "loss/idx": 9.5, "loss/logits": 0.2037796601653099, "step": 4886 }, { "epoch": 0.07297351779541433, "grad_norm": 0.294921875, "grad_norm_var": 0.03490164677302043, "learning_rate": 0.0001, "loss": 1.532, "loss/crossentropy": 2.9358073472976685, "loss/fcd": 1.328125, "loss/idx": 9.5, "loss/logits": 0.20386315137147903, "step": 4887 }, { "epoch": 0.07298844996602931, "grad_norm": 0.23828125, "grad_norm_var": 0.03371066649754842, "learning_rate": 0.0001, "loss": 1.3303, "loss/crossentropy": 2.367699146270752, "loss/fcd": 1.16015625, "loss/idx": 9.5, "loss/logits": 0.17018291354179382, "step": 4888 }, { "epoch": 0.0730033821366443, "grad_norm": 0.279296875, "grad_norm_var": 0.0015279730161031088, "learning_rate": 0.0001, "loss": 1.3978, "loss/crossentropy": 2.52450168132782, "loss/fcd": 1.21875, "loss/idx": 9.5, "loss/logits": 0.17908194661140442, "step": 4889 }, { "epoch": 0.07301831430725927, "grad_norm": 0.2734375, "grad_norm_var": 0.00036836862564086913, "learning_rate": 0.0001, "loss": 1.4097, "loss/crossentropy": 2.615095853805542, "loss/fcd": 1.22265625, "loss/idx": 9.5, "loss/logits": 0.18703532218933105, "step": 4890 }, { "epoch": 0.07303324647787425, "grad_norm": 0.22265625, "grad_norm_var": 0.00043312311172485354, "learning_rate": 0.0001, "loss": 1.4136, "loss/crossentropy": 2.4338220357894897, "loss/fcd": 1.21875, "loss/idx": 9.5, "loss/logits": 0.19485481828451157, "step": 4891 }, { "epoch": 0.07304817864848924, "grad_norm": 0.287109375, "grad_norm_var": 0.0004936178525288899, "learning_rate": 0.0001, "loss": 1.4293, "loss/crossentropy": 2.7154897451400757, "loss/fcd": 1.2265625, "loss/idx": 9.5, "loss/logits": 0.20275069773197174, "step": 4892 }, { "epoch": 0.07306311081910422, "grad_norm": 0.31640625, "grad_norm_var": 0.0007059057553609212, "learning_rate": 0.0001, "loss": 1.6713, "loss/crossentropy": 2.918621063232422, "loss/fcd": 1.359375, "loss/idx": 9.5, "loss/logits": 0.31195420771837234, "step": 4893 }, { "epoch": 0.0730780429897192, "grad_norm": 0.2197265625, "grad_norm_var": 0.0007640202840169271, "learning_rate": 0.0001, "loss": 1.3158, "loss/crossentropy": 2.554708957672119, "loss/fcd": 1.140625, "loss/idx": 9.5, "loss/logits": 0.17513103038072586, "step": 4894 }, { "epoch": 0.07309297516033418, "grad_norm": 0.2451171875, "grad_norm_var": 0.0007310231526692708, "learning_rate": 0.0001, "loss": 1.498, "loss/crossentropy": 2.5123186111450195, "loss/fcd": 1.28125, "loss/idx": 9.5, "loss/logits": 0.21675753593444824, "step": 4895 }, { "epoch": 0.07310790733094916, "grad_norm": 0.27734375, "grad_norm_var": 0.0007385889689127604, "learning_rate": 0.0001, "loss": 1.4584, "loss/crossentropy": 2.6344499588012695, "loss/fcd": 1.26171875, "loss/idx": 9.5, "loss/logits": 0.1966942697763443, "step": 4896 }, { "epoch": 0.07312283950156415, "grad_norm": 0.2431640625, "grad_norm_var": 0.0007485032081604003, "learning_rate": 0.0001, "loss": 1.27, "loss/crossentropy": 2.6999019384384155, "loss/fcd": 1.10546875, "loss/idx": 9.5, "loss/logits": 0.164500392973423, "step": 4897 }, { "epoch": 0.07313777167217912, "grad_norm": 0.248046875, "grad_norm_var": 0.0007319728533426921, "learning_rate": 0.0001, "loss": 1.4292, "loss/crossentropy": 2.422290802001953, "loss/fcd": 1.23046875, "loss/idx": 9.5, "loss/logits": 0.19868811964988708, "step": 4898 }, { "epoch": 0.07315270384279411, "grad_norm": 0.240234375, "grad_norm_var": 0.0007532080014546712, "learning_rate": 0.0001, "loss": 1.3265, "loss/crossentropy": 2.602862000465393, "loss/fcd": 1.16796875, "loss/idx": 9.5, "loss/logits": 0.15851373225450516, "step": 4899 }, { "epoch": 0.07316763601340909, "grad_norm": 0.251953125, "grad_norm_var": 0.0007344206174214681, "learning_rate": 0.0001, "loss": 1.4051, "loss/crossentropy": 2.344017505645752, "loss/fcd": 1.23046875, "loss/idx": 9.5, "loss/logits": 0.17460446804761887, "step": 4900 }, { "epoch": 0.07318256818402408, "grad_norm": 0.23828125, "grad_norm_var": 0.0007344206174214681, "learning_rate": 0.0001, "loss": 1.2927, "loss/crossentropy": 2.6633646488189697, "loss/fcd": 1.125, "loss/idx": 9.5, "loss/logits": 0.16772408038377762, "step": 4901 }, { "epoch": 0.07319750035463905, "grad_norm": 0.244140625, "grad_norm_var": 0.0007470091183980306, "learning_rate": 0.0001, "loss": 1.3852, "loss/crossentropy": 2.5297025442123413, "loss/fcd": 1.18359375, "loss/idx": 9.5, "loss/logits": 0.20157784968614578, "step": 4902 }, { "epoch": 0.07321243252525403, "grad_norm": 0.31640625, "grad_norm_var": 0.0008830348650614421, "learning_rate": 0.0001, "loss": 1.4634, "loss/crossentropy": 2.535112738609314, "loss/fcd": 1.2578125, "loss/idx": 9.5, "loss/logits": 0.20555371046066284, "step": 4903 }, { "epoch": 0.07322736469586902, "grad_norm": 0.25390625, "grad_norm_var": 0.0008554418881734212, "learning_rate": 0.0001, "loss": 1.4943, "loss/crossentropy": 2.8103857040405273, "loss/fcd": 1.26953125, "loss/idx": 9.5, "loss/logits": 0.22478019446134567, "step": 4904 }, { "epoch": 0.073242296866484, "grad_norm": 0.2392578125, "grad_norm_var": 0.0008516947428385417, "learning_rate": 0.0001, "loss": 1.3336, "loss/crossentropy": 2.5439900159835815, "loss/fcd": 1.15625, "loss/idx": 9.5, "loss/logits": 0.17738018184900284, "step": 4905 }, { "epoch": 0.07325722903709898, "grad_norm": 0.2333984375, "grad_norm_var": 0.0008658687273661296, "learning_rate": 0.0001, "loss": 1.5368, "loss/crossentropy": 2.4361467361450195, "loss/fcd": 1.3046875, "loss/idx": 9.5, "loss/logits": 0.23210851848125458, "step": 4906 }, { "epoch": 0.07327216120771396, "grad_norm": 0.345703125, "grad_norm_var": 0.0012844363848368327, "learning_rate": 0.0001, "loss": 1.4581, "loss/crossentropy": 2.4709609746932983, "loss/fcd": 1.26171875, "loss/idx": 9.5, "loss/logits": 0.19634266197681427, "step": 4907 }, { "epoch": 0.07328709337832894, "grad_norm": 0.2470703125, "grad_norm_var": 0.0012533187866210938, "learning_rate": 0.0001, "loss": 1.3732, "loss/crossentropy": 2.653868079185486, "loss/fcd": 1.203125, "loss/idx": 9.5, "loss/logits": 0.17008209228515625, "step": 4908 }, { "epoch": 0.07330202554894392, "grad_norm": 0.28515625, "grad_norm_var": 0.0010793685913085937, "learning_rate": 0.0001, "loss": 1.4831, "loss/crossentropy": 2.538804531097412, "loss/fcd": 1.29296875, "loss/idx": 9.5, "loss/logits": 0.1900831088423729, "step": 4909 }, { "epoch": 0.0733169577195589, "grad_norm": 0.25, "grad_norm_var": 0.0009819308916727701, "learning_rate": 0.0001, "loss": 1.4127, "loss/crossentropy": 2.7143219709396362, "loss/fcd": 1.2109375, "loss/idx": 9.5, "loss/logits": 0.20173430442810059, "step": 4910 }, { "epoch": 0.07333188989017389, "grad_norm": 0.2333984375, "grad_norm_var": 0.0010136882464090982, "learning_rate": 0.0001, "loss": 1.3628, "loss/crossentropy": 2.6093517541885376, "loss/fcd": 1.171875, "loss/idx": 9.5, "loss/logits": 0.19091062992811203, "step": 4911 }, { "epoch": 0.07334682206078887, "grad_norm": 0.2353515625, "grad_norm_var": 0.0010224024454752604, "learning_rate": 0.0001, "loss": 1.4142, "loss/crossentropy": 2.8313961029052734, "loss/fcd": 1.20703125, "loss/idx": 9.5, "loss/logits": 0.20720559358596802, "step": 4912 }, { "epoch": 0.07336175423140384, "grad_norm": 0.26171875, "grad_norm_var": 0.0010107000668843587, "learning_rate": 0.0001, "loss": 1.3313, "loss/crossentropy": 2.4325836896896362, "loss/fcd": 1.1640625, "loss/idx": 9.5, "loss/logits": 0.1672002449631691, "step": 4913 }, { "epoch": 0.07337668640201883, "grad_norm": 0.30859375, "grad_norm_var": 0.0011614759763081868, "learning_rate": 0.0001, "loss": 1.5554, "loss/crossentropy": 2.6259422302246094, "loss/fcd": 1.33984375, "loss/idx": 9.5, "loss/logits": 0.21552012860774994, "step": 4914 }, { "epoch": 0.07339161857263381, "grad_norm": 0.25390625, "grad_norm_var": 0.0011343280474344888, "learning_rate": 0.0001, "loss": 1.3874, "loss/crossentropy": 2.5178741216659546, "loss/fcd": 1.19140625, "loss/idx": 9.5, "loss/logits": 0.19598689675331116, "step": 4915 }, { "epoch": 0.0734065507432488, "grad_norm": 0.447265625, "grad_norm_var": 0.003246716658274333, "learning_rate": 0.0001, "loss": 2.243, "loss/crossentropy": 2.573517918586731, "loss/fcd": 1.91015625, "loss/idx": 9.5, "loss/logits": 0.3328510671854019, "step": 4916 }, { "epoch": 0.07342148291386377, "grad_norm": 0.26953125, "grad_norm_var": 0.003156435489654541, "learning_rate": 0.0001, "loss": 1.2808, "loss/crossentropy": 2.620609760284424, "loss/fcd": 1.12109375, "loss/idx": 9.5, "loss/logits": 0.15967822074890137, "step": 4917 }, { "epoch": 0.07343641508447875, "grad_norm": 0.236328125, "grad_norm_var": 0.003194010257720947, "learning_rate": 0.0001, "loss": 1.3151, "loss/crossentropy": 2.5793557167053223, "loss/fcd": 1.15625, "loss/idx": 9.5, "loss/logits": 0.1588720604777336, "step": 4918 }, { "epoch": 0.07345134725509374, "grad_norm": 0.27734375, "grad_norm_var": 0.0030792514483133954, "learning_rate": 0.0001, "loss": 1.4944, "loss/crossentropy": 2.5939602851867676, "loss/fcd": 1.29296875, "loss/idx": 9.5, "loss/logits": 0.2014525905251503, "step": 4919 }, { "epoch": 0.07346627942570871, "grad_norm": 0.2197265625, "grad_norm_var": 0.0032421112060546874, "learning_rate": 0.0001, "loss": 1.3523, "loss/crossentropy": 2.525184154510498, "loss/fcd": 1.1640625, "loss/idx": 9.5, "loss/logits": 0.18819276243448257, "step": 4920 }, { "epoch": 0.0734812115963237, "grad_norm": 0.24609375, "grad_norm_var": 0.0032156586647033693, "learning_rate": 0.0001, "loss": 1.4116, "loss/crossentropy": 2.4632328748703003, "loss/fcd": 1.234375, "loss/idx": 9.5, "loss/logits": 0.17724600434303284, "step": 4921 }, { "epoch": 0.07349614376693868, "grad_norm": 0.23828125, "grad_norm_var": 0.0031920750935872395, "learning_rate": 0.0001, "loss": 1.3335, "loss/crossentropy": 2.5373159646987915, "loss/fcd": 1.17578125, "loss/idx": 9.5, "loss/logits": 0.1576797068119049, "step": 4922 }, { "epoch": 0.07351107593755367, "grad_norm": 0.25390625, "grad_norm_var": 0.0028192996978759766, "learning_rate": 0.0001, "loss": 1.5127, "loss/crossentropy": 2.618391752243042, "loss/fcd": 1.29296875, "loss/idx": 9.5, "loss/logits": 0.21972167491912842, "step": 4923 }, { "epoch": 0.07352600810816864, "grad_norm": 0.2578125, "grad_norm_var": 0.0027987122535705566, "learning_rate": 0.0001, "loss": 1.4812, "loss/crossentropy": 2.575442314147949, "loss/fcd": 1.28125, "loss/idx": 9.5, "loss/logits": 0.1999652311205864, "step": 4924 }, { "epoch": 0.07354094027878362, "grad_norm": 0.205078125, "grad_norm_var": 0.0030072490374247235, "learning_rate": 0.0001, "loss": 1.3211, "loss/crossentropy": 2.415941834449768, "loss/fcd": 1.140625, "loss/idx": 9.5, "loss/logits": 0.1804439052939415, "step": 4925 }, { "epoch": 0.07355587244939861, "grad_norm": 0.2421875, "grad_norm_var": 0.003023715813954671, "learning_rate": 0.0001, "loss": 1.4916, "loss/crossentropy": 2.331746459007263, "loss/fcd": 1.2890625, "loss/idx": 9.5, "loss/logits": 0.20258602499961853, "step": 4926 }, { "epoch": 0.07357080462001359, "grad_norm": 0.2890625, "grad_norm_var": 0.0030076344807942706, "learning_rate": 0.0001, "loss": 1.5035, "loss/crossentropy": 2.2973849773406982, "loss/fcd": 1.296875, "loss/idx": 9.5, "loss/logits": 0.20662497729063034, "step": 4927 }, { "epoch": 0.07358573679062858, "grad_norm": 0.2470703125, "grad_norm_var": 0.002969678243001302, "learning_rate": 0.0001, "loss": 1.371, "loss/crossentropy": 2.63884174823761, "loss/fcd": 1.19140625, "loss/idx": 9.5, "loss/logits": 0.1796058788895607, "step": 4928 }, { "epoch": 0.07360066896124355, "grad_norm": 0.2734375, "grad_norm_var": 0.0029717763264973957, "learning_rate": 0.0001, "loss": 1.5277, "loss/crossentropy": 2.538954496383667, "loss/fcd": 1.32421875, "loss/idx": 9.5, "loss/logits": 0.20343317091464996, "step": 4929 }, { "epoch": 0.07361560113185853, "grad_norm": 0.220703125, "grad_norm_var": 0.0029624780019124348, "learning_rate": 0.0001, "loss": 1.3427, "loss/crossentropy": 2.649017810821533, "loss/fcd": 1.16015625, "loss/idx": 9.5, "loss/logits": 0.18254593014717102, "step": 4930 }, { "epoch": 0.07363053330247352, "grad_norm": 0.296875, "grad_norm_var": 0.003036610285441081, "learning_rate": 0.0001, "loss": 1.6554, "loss/crossentropy": 2.5550791025161743, "loss/fcd": 1.43359375, "loss/idx": 9.5, "loss/logits": 0.22176840156316757, "step": 4931 }, { "epoch": 0.07364546547308849, "grad_norm": 0.25390625, "grad_norm_var": 0.0006432215372721354, "learning_rate": 0.0001, "loss": 1.446, "loss/crossentropy": 2.6104464530944824, "loss/fcd": 1.23828125, "loss/idx": 9.5, "loss/logits": 0.20774635672569275, "step": 4932 }, { "epoch": 0.07366039764370348, "grad_norm": 0.2138671875, "grad_norm_var": 0.0007046024004618327, "learning_rate": 0.0001, "loss": 1.4136, "loss/crossentropy": 2.5982483625411987, "loss/fcd": 1.22265625, "loss/idx": 9.5, "loss/logits": 0.19097516685724258, "step": 4933 }, { "epoch": 0.07367532981431846, "grad_norm": 0.2236328125, "grad_norm_var": 0.0007348219553629557, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.644276976585388, "loss/fcd": 1.15625, "loss/idx": 9.5, "loss/logits": 0.1709606721997261, "step": 4934 }, { "epoch": 0.07369026198493343, "grad_norm": 0.2236328125, "grad_norm_var": 0.0007009466489156087, "learning_rate": 0.0001, "loss": 1.4153, "loss/crossentropy": 2.577437162399292, "loss/fcd": 1.203125, "loss/idx": 9.5, "loss/logits": 0.21222001314163208, "step": 4935 }, { "epoch": 0.07370519415554842, "grad_norm": 0.294921875, "grad_norm_var": 0.0008101781209309896, "learning_rate": 0.0001, "loss": 1.5492, "loss/crossentropy": 2.693794846534729, "loss/fcd": 1.3203125, "loss/idx": 9.5, "loss/logits": 0.22888051718473434, "step": 4936 }, { "epoch": 0.0737201263261634, "grad_norm": 0.2421875, "grad_norm_var": 0.000812530517578125, "learning_rate": 0.0001, "loss": 1.3303, "loss/crossentropy": 2.6171709299087524, "loss/fcd": 1.1484375, "loss/idx": 9.5, "loss/logits": 0.18190811574459076, "step": 4937 }, { "epoch": 0.07373505849677839, "grad_norm": 0.234375, "grad_norm_var": 0.0008188247680664063, "learning_rate": 0.0001, "loss": 1.443, "loss/crossentropy": 2.285043239593506, "loss/fcd": 1.2421875, "loss/idx": 9.5, "loss/logits": 0.20076430588960648, "step": 4938 }, { "epoch": 0.07374999066739336, "grad_norm": 0.3515625, "grad_norm_var": 0.0014879862467447917, "learning_rate": 0.0001, "loss": 1.4927, "loss/crossentropy": 2.5083372592926025, "loss/fcd": 1.27734375, "loss/idx": 9.5, "loss/logits": 0.21537843346595764, "step": 4939 }, { "epoch": 0.07376492283800835, "grad_norm": 0.216796875, "grad_norm_var": 0.0015744368235270183, "learning_rate": 0.0001, "loss": 1.2773, "loss/crossentropy": 2.6706401109695435, "loss/fcd": 1.12109375, "loss/idx": 9.5, "loss/logits": 0.15625548362731934, "step": 4940 }, { "epoch": 0.07377985500862333, "grad_norm": 0.25390625, "grad_norm_var": 0.0014190673828125, "learning_rate": 0.0001, "loss": 1.5049, "loss/crossentropy": 2.8338024616241455, "loss/fcd": 1.26171875, "loss/idx": 9.5, "loss/logits": 0.24320732802152634, "step": 4941 }, { "epoch": 0.0737947871792383, "grad_norm": 0.2578125, "grad_norm_var": 0.0014078776041666667, "learning_rate": 0.0001, "loss": 1.4234, "loss/crossentropy": 2.6376057863235474, "loss/fcd": 1.21875, "loss/idx": 9.5, "loss/logits": 0.20468927174806595, "step": 4942 }, { "epoch": 0.0738097193498533, "grad_norm": 0.2578125, "grad_norm_var": 0.00133056640625, "learning_rate": 0.0001, "loss": 1.4158, "loss/crossentropy": 2.6811572313308716, "loss/fcd": 1.21875, "loss/idx": 9.5, "loss/logits": 0.19700688868761063, "step": 4943 }, { "epoch": 0.07382465152046827, "grad_norm": 0.25, "grad_norm_var": 0.001328432559967041, "learning_rate": 0.0001, "loss": 1.3481, "loss/crossentropy": 2.3179229497909546, "loss/fcd": 1.17578125, "loss/idx": 9.5, "loss/logits": 0.17236290127038956, "step": 4944 }, { "epoch": 0.07383958369108326, "grad_norm": 0.2265625, "grad_norm_var": 0.0013448357582092284, "learning_rate": 0.0001, "loss": 1.28, "loss/crossentropy": 2.4871491193771362, "loss/fcd": 1.12109375, "loss/idx": 9.5, "loss/logits": 0.15888357907533646, "step": 4945 }, { "epoch": 0.07385451586169824, "grad_norm": 0.25390625, "grad_norm_var": 0.0012789050738016765, "learning_rate": 0.0001, "loss": 1.4779, "loss/crossentropy": 2.811417818069458, "loss/fcd": 1.27734375, "loss/idx": 9.5, "loss/logits": 0.20054633915424347, "step": 4946 }, { "epoch": 0.07386944803231321, "grad_norm": 0.267578125, "grad_norm_var": 0.0011620799700419108, "learning_rate": 0.0001, "loss": 1.554, "loss/crossentropy": 2.5414501428604126, "loss/fcd": 1.33203125, "loss/idx": 9.5, "loss/logits": 0.2219327688217163, "step": 4947 }, { "epoch": 0.0738843802029282, "grad_norm": 0.25390625, "grad_norm_var": 0.0011620799700419108, "learning_rate": 0.0001, "loss": 1.5211, "loss/crossentropy": 2.490052342414856, "loss/fcd": 1.328125, "loss/idx": 9.5, "loss/logits": 0.19293705374002457, "step": 4948 }, { "epoch": 0.07389931237354318, "grad_norm": 0.2216796875, "grad_norm_var": 0.0011267940203348796, "learning_rate": 0.0001, "loss": 1.5334, "loss/crossentropy": 2.2880897521972656, "loss/fcd": 1.27734375, "loss/idx": 9.5, "loss/logits": 0.25602225959300995, "step": 4949 }, { "epoch": 0.07391424454415817, "grad_norm": 0.396484375, "grad_norm_var": 0.002342859903971354, "learning_rate": 0.0001, "loss": 1.6321, "loss/crossentropy": 2.5283336639404297, "loss/fcd": 1.37109375, "loss/idx": 9.5, "loss/logits": 0.2609601393342018, "step": 4950 }, { "epoch": 0.07392917671477314, "grad_norm": 0.234375, "grad_norm_var": 0.0022941231727600098, "learning_rate": 0.0001, "loss": 1.3176, "loss/crossentropy": 2.6567234992980957, "loss/fcd": 1.15234375, "loss/idx": 9.5, "loss/logits": 0.16530242562294006, "step": 4951 }, { "epoch": 0.07394410888538812, "grad_norm": 0.322265625, "grad_norm_var": 0.00245589812596639, "learning_rate": 0.0001, "loss": 1.4167, "loss/crossentropy": 2.713452458381653, "loss/fcd": 1.22265625, "loss/idx": 9.5, "loss/logits": 0.1940694972872734, "step": 4952 }, { "epoch": 0.07395904105600311, "grad_norm": 0.244140625, "grad_norm_var": 0.0024501760800679524, "learning_rate": 0.0001, "loss": 1.3474, "loss/crossentropy": 2.671997308731079, "loss/fcd": 1.16796875, "loss/idx": 9.5, "loss/logits": 0.1794317215681076, "step": 4953 }, { "epoch": 0.07397397322661808, "grad_norm": 0.220703125, "grad_norm_var": 0.0025180459022521974, "learning_rate": 0.0001, "loss": 1.3358, "loss/crossentropy": 2.69063401222229, "loss/fcd": 1.1484375, "loss/idx": 9.5, "loss/logits": 0.1873922124505043, "step": 4954 }, { "epoch": 0.07398890539723307, "grad_norm": 0.244140625, "grad_norm_var": 0.0019900282224019367, "learning_rate": 0.0001, "loss": 1.4728, "loss/crossentropy": 2.5552443265914917, "loss/fcd": 1.2734375, "loss/idx": 9.5, "loss/logits": 0.19932463765144348, "step": 4955 }, { "epoch": 0.07400383756784805, "grad_norm": 0.265625, "grad_norm_var": 0.0018732031186421713, "learning_rate": 0.0001, "loss": 1.4208, "loss/crossentropy": 2.841711401939392, "loss/fcd": 1.21484375, "loss/idx": 9.5, "loss/logits": 0.20591379702091217, "step": 4956 }, { "epoch": 0.07401876973846302, "grad_norm": 0.26171875, "grad_norm_var": 0.00186996062596639, "learning_rate": 0.0001, "loss": 1.4668, "loss/crossentropy": 2.5700618028640747, "loss/fcd": 1.26953125, "loss/idx": 9.5, "loss/logits": 0.19729340076446533, "step": 4957 }, { "epoch": 0.07403370190907801, "grad_norm": 0.23046875, "grad_norm_var": 0.0019289294878641764, "learning_rate": 0.0001, "loss": 1.4622, "loss/crossentropy": 2.5206685066223145, "loss/fcd": 1.25390625, "loss/idx": 9.5, "loss/logits": 0.2082926258444786, "step": 4958 }, { "epoch": 0.07404863407969299, "grad_norm": 0.302734375, "grad_norm_var": 0.0020451823870340983, "learning_rate": 0.0001, "loss": 1.5012, "loss/crossentropy": 2.75728178024292, "loss/fcd": 1.2734375, "loss/idx": 9.5, "loss/logits": 0.22775547951459885, "step": 4959 }, { "epoch": 0.07406356625030798, "grad_norm": 0.2275390625, "grad_norm_var": 0.002113453547159831, "learning_rate": 0.0001, "loss": 1.4374, "loss/crossentropy": 2.6118528842926025, "loss/fcd": 1.23828125, "loss/idx": 9.5, "loss/logits": 0.19911423325538635, "step": 4960 }, { "epoch": 0.07407849842092296, "grad_norm": 0.234375, "grad_norm_var": 0.0020815372467041016, "learning_rate": 0.0001, "loss": 1.4784, "loss/crossentropy": 2.543386697769165, "loss/fcd": 1.2578125, "loss/idx": 9.5, "loss/logits": 0.22060686349868774, "step": 4961 }, { "epoch": 0.07409343059153795, "grad_norm": 0.291015625, "grad_norm_var": 0.0021307627360026044, "learning_rate": 0.0001, "loss": 1.4174, "loss/crossentropy": 2.812305212020874, "loss/fcd": 1.23046875, "loss/idx": 9.5, "loss/logits": 0.18688520789146423, "step": 4962 }, { "epoch": 0.07410836276215292, "grad_norm": 0.2333984375, "grad_norm_var": 0.002185976505279541, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.731266140937805, "loss/fcd": 1.265625, "loss/idx": 9.5, "loss/logits": 0.20456887781620026, "step": 4963 }, { "epoch": 0.0741232949327679, "grad_norm": 0.251953125, "grad_norm_var": 0.0021882017453511557, "learning_rate": 0.0001, "loss": 1.4347, "loss/crossentropy": 2.665467858314514, "loss/fcd": 1.23046875, "loss/idx": 9.5, "loss/logits": 0.20425113290548325, "step": 4964 }, { "epoch": 0.07413822710338289, "grad_norm": 0.2099609375, "grad_norm_var": 0.0022588690121968587, "learning_rate": 0.0001, "loss": 1.2955, "loss/crossentropy": 2.4643832445144653, "loss/fcd": 1.125, "loss/idx": 9.5, "loss/logits": 0.17052753269672394, "step": 4965 }, { "epoch": 0.07415315927399786, "grad_norm": 0.25390625, "grad_norm_var": 0.0009477257728576661, "learning_rate": 0.0001, "loss": 1.3464, "loss/crossentropy": 2.5983158349990845, "loss/fcd": 1.15625, "loss/idx": 9.5, "loss/logits": 0.19012674689292908, "step": 4966 }, { "epoch": 0.07416809144461285, "grad_norm": 0.271484375, "grad_norm_var": 0.0009477257728576661, "learning_rate": 0.0001, "loss": 1.5854, "loss/crossentropy": 2.412498950958252, "loss/fcd": 1.359375, "loss/idx": 9.5, "loss/logits": 0.22599749267101288, "step": 4967 }, { "epoch": 0.07418302361522783, "grad_norm": 0.2578125, "grad_norm_var": 0.0006214737892150879, "learning_rate": 0.0001, "loss": 1.4391, "loss/crossentropy": 2.503898024559021, "loss/fcd": 1.2421875, "loss/idx": 9.5, "loss/logits": 0.19689171016216278, "step": 4968 }, { "epoch": 0.0741979557858428, "grad_norm": 0.267578125, "grad_norm_var": 0.0006373047828674317, "learning_rate": 0.0001, "loss": 1.438, "loss/crossentropy": 2.291543960571289, "loss/fcd": 1.2421875, "loss/idx": 9.5, "loss/logits": 0.19580358266830444, "step": 4969 }, { "epoch": 0.0742128879564578, "grad_norm": 0.2275390625, "grad_norm_var": 0.0006121317545572917, "learning_rate": 0.0001, "loss": 1.4586, "loss/crossentropy": 2.4351853132247925, "loss/fcd": 1.25390625, "loss/idx": 9.5, "loss/logits": 0.20467832684516907, "step": 4970 }, { "epoch": 0.07422782012707277, "grad_norm": 0.296875, "grad_norm_var": 0.0007310072580973307, "learning_rate": 0.0001, "loss": 1.4062, "loss/crossentropy": 2.4569783210754395, "loss/fcd": 1.2109375, "loss/idx": 9.5, "loss/logits": 0.19528914988040924, "step": 4971 }, { "epoch": 0.07424275229768776, "grad_norm": 0.2275390625, "grad_norm_var": 0.0007689754168192545, "learning_rate": 0.0001, "loss": 1.3586, "loss/crossentropy": 2.7060126066207886, "loss/fcd": 1.171875, "loss/idx": 9.5, "loss/logits": 0.18672140687704086, "step": 4972 }, { "epoch": 0.07425768446830273, "grad_norm": 0.25, "grad_norm_var": 0.0007637302080790202, "learning_rate": 0.0001, "loss": 1.4816, "loss/crossentropy": 2.6120587587356567, "loss/fcd": 1.26953125, "loss/idx": 9.5, "loss/logits": 0.21203159540891647, "step": 4973 }, { "epoch": 0.07427261663891771, "grad_norm": 0.25390625, "grad_norm_var": 0.0007303516070048014, "learning_rate": 0.0001, "loss": 1.57, "loss/crossentropy": 2.4641934633255005, "loss/fcd": 1.3359375, "loss/idx": 9.5, "loss/logits": 0.23402220755815506, "step": 4974 }, { "epoch": 0.0742875488095327, "grad_norm": 0.2138671875, "grad_norm_var": 0.0006417592366536458, "learning_rate": 0.0001, "loss": 1.3751, "loss/crossentropy": 2.597882390022278, "loss/fcd": 1.1796875, "loss/idx": 9.5, "loss/logits": 0.19540070742368698, "step": 4975 }, { "epoch": 0.07430248098014768, "grad_norm": 0.28515625, "grad_norm_var": 0.0006916960080464681, "learning_rate": 0.0001, "loss": 1.5175, "loss/crossentropy": 2.6692832708358765, "loss/fcd": 1.30859375, "loss/idx": 9.5, "loss/logits": 0.208876371383667, "step": 4976 }, { "epoch": 0.07431741315076267, "grad_norm": 0.244140625, "grad_norm_var": 0.0006751656532287597, "learning_rate": 0.0001, "loss": 1.4117, "loss/crossentropy": 2.5554479360580444, "loss/fcd": 1.22265625, "loss/idx": 9.5, "loss/logits": 0.18908847868442535, "step": 4977 }, { "epoch": 0.07433234532137764, "grad_norm": 0.279296875, "grad_norm_var": 0.0006231904029846191, "learning_rate": 0.0001, "loss": 1.4121, "loss/crossentropy": 2.657364249229431, "loss/fcd": 1.21875, "loss/idx": 9.5, "loss/logits": 0.19332724064588547, "step": 4978 }, { "epoch": 0.07434727749199262, "grad_norm": 0.28515625, "grad_norm_var": 0.0006655216217041016, "learning_rate": 0.0001, "loss": 1.5281, "loss/crossentropy": 2.4764440059661865, "loss/fcd": 1.3203125, "loss/idx": 9.5, "loss/logits": 0.20775946974754333, "step": 4979 }, { "epoch": 0.0743622096626076, "grad_norm": 0.236328125, "grad_norm_var": 0.0006866296132405599, "learning_rate": 0.0001, "loss": 1.4723, "loss/crossentropy": 2.4341593980789185, "loss/fcd": 1.26171875, "loss/idx": 9.5, "loss/logits": 0.21057672053575516, "step": 4980 }, { "epoch": 0.07437714183322258, "grad_norm": 0.23828125, "grad_norm_var": 0.000571278731028239, "learning_rate": 0.0001, "loss": 1.3359, "loss/crossentropy": 2.589960217475891, "loss/fcd": 1.15625, "loss/idx": 9.5, "loss/logits": 0.17969129234552383, "step": 4981 }, { "epoch": 0.07439207400383757, "grad_norm": 0.318359375, "grad_norm_var": 0.0008167545000712077, "learning_rate": 0.0001, "loss": 1.9899, "loss/crossentropy": 2.621371865272522, "loss/fcd": 1.64453125, "loss/idx": 9.5, "loss/logits": 0.34532782435417175, "step": 4982 }, { "epoch": 0.07440700617445255, "grad_norm": 0.2578125, "grad_norm_var": 0.0008067409197489421, "learning_rate": 0.0001, "loss": 1.5014, "loss/crossentropy": 2.583021879196167, "loss/fcd": 1.28125, "loss/idx": 9.5, "loss/logits": 0.2201850265264511, "step": 4983 }, { "epoch": 0.07442193834506754, "grad_norm": 0.2333984375, "grad_norm_var": 0.0008469740549723307, "learning_rate": 0.0001, "loss": 1.3506, "loss/crossentropy": 2.6903263330459595, "loss/fcd": 1.171875, "loss/idx": 9.5, "loss/logits": 0.17872704565525055, "step": 4984 }, { "epoch": 0.07443687051568251, "grad_norm": 0.28515625, "grad_norm_var": 0.0008906046549479167, "learning_rate": 0.0001, "loss": 1.4527, "loss/crossentropy": 2.705268383026123, "loss/fcd": 1.25390625, "loss/idx": 9.5, "loss/logits": 0.19879008829593658, "step": 4985 }, { "epoch": 0.07445180268629749, "grad_norm": 0.232421875, "grad_norm_var": 0.0008720676104227702, "learning_rate": 0.0001, "loss": 1.3575, "loss/crossentropy": 2.535316228866577, "loss/fcd": 1.18359375, "loss/idx": 9.5, "loss/logits": 0.1739106923341751, "step": 4986 }, { "epoch": 0.07446673485691248, "grad_norm": 0.248046875, "grad_norm_var": 0.0007719318072001139, "learning_rate": 0.0001, "loss": 1.383, "loss/crossentropy": 2.624149203300476, "loss/fcd": 1.19921875, "loss/idx": 9.5, "loss/logits": 0.18376524746418, "step": 4987 }, { "epoch": 0.07448166702752745, "grad_norm": 0.21875, "grad_norm_var": 0.0008095900217692057, "learning_rate": 0.0001, "loss": 1.3418, "loss/crossentropy": 2.6604868173599243, "loss/fcd": 1.1640625, "loss/idx": 9.5, "loss/logits": 0.17770003527402878, "step": 4988 }, { "epoch": 0.07449659919814244, "grad_norm": 0.2412109375, "grad_norm_var": 0.0008202830950419109, "learning_rate": 0.0001, "loss": 1.2762, "loss/crossentropy": 2.8681591749191284, "loss/fcd": 1.109375, "loss/idx": 9.5, "loss/logits": 0.16683156788349152, "step": 4989 }, { "epoch": 0.07451153136875742, "grad_norm": 0.25390625, "grad_norm_var": 0.0008202830950419109, "learning_rate": 0.0001, "loss": 1.5119, "loss/crossentropy": 2.5643328428268433, "loss/fcd": 1.3046875, "loss/idx": 9.5, "loss/logits": 0.20720911026000977, "step": 4990 }, { "epoch": 0.0745264635393724, "grad_norm": 0.25390625, "grad_norm_var": 0.0007037957509358724, "learning_rate": 0.0001, "loss": 1.4716, "loss/crossentropy": 2.681180000305176, "loss/fcd": 1.265625, "loss/idx": 9.5, "loss/logits": 0.20598550140857697, "step": 4991 }, { "epoch": 0.07454139570998738, "grad_norm": 0.267578125, "grad_norm_var": 0.0006570180257161458, "learning_rate": 0.0001, "loss": 1.4166, "loss/crossentropy": 2.653096556663513, "loss/fcd": 1.21875, "loss/idx": 9.5, "loss/logits": 0.1978471502661705, "step": 4992 }, { "epoch": 0.07455632788060236, "grad_norm": 0.251953125, "grad_norm_var": 0.0006486256917317708, "learning_rate": 0.0001, "loss": 1.4899, "loss/crossentropy": 2.56731915473938, "loss/fcd": 1.265625, "loss/idx": 9.5, "loss/logits": 0.22424253821372986, "step": 4993 }, { "epoch": 0.07457126005121735, "grad_norm": 0.2158203125, "grad_norm_var": 0.0007062236467997233, "learning_rate": 0.0001, "loss": 1.364, "loss/crossentropy": 2.410842776298523, "loss/fcd": 1.18359375, "loss/idx": 9.5, "loss/logits": 0.18044090270996094, "step": 4994 }, { "epoch": 0.07458619222183233, "grad_norm": 0.24609375, "grad_norm_var": 0.0006308833758036296, "learning_rate": 0.0001, "loss": 1.3125, "loss/crossentropy": 2.616830348968506, "loss/fcd": 1.1328125, "loss/idx": 9.5, "loss/logits": 0.17968188971281052, "step": 4995 }, { "epoch": 0.0746011243924473, "grad_norm": 0.2578125, "grad_norm_var": 0.0006207426389058431, "learning_rate": 0.0001, "loss": 1.4276, "loss/crossentropy": 2.678797483444214, "loss/fcd": 1.2265625, "loss/idx": 9.5, "loss/logits": 0.20103546231985092, "step": 4996 }, { "epoch": 0.07461605656306229, "grad_norm": 0.2353515625, "grad_norm_var": 0.0006263573964436849, "learning_rate": 0.0001, "loss": 1.2775, "loss/crossentropy": 2.540307879447937, "loss/fcd": 1.12890625, "loss/idx": 9.5, "loss/logits": 0.148633174598217, "step": 4997 }, { "epoch": 0.07463098873367727, "grad_norm": 0.2333984375, "grad_norm_var": 0.0003155668576558431, "learning_rate": 0.0001, "loss": 1.3572, "loss/crossentropy": 2.5062766075134277, "loss/fcd": 1.17578125, "loss/idx": 9.5, "loss/logits": 0.181427501142025, "step": 4998 }, { "epoch": 0.07464592090429226, "grad_norm": 0.298828125, "grad_norm_var": 0.0004864652951558431, "learning_rate": 0.0001, "loss": 1.4842, "loss/crossentropy": 2.7391358613967896, "loss/fcd": 1.2734375, "loss/idx": 9.5, "loss/logits": 0.2107638567686081, "step": 4999 }, { "epoch": 0.07466085307490723, "grad_norm": 0.2490234375, "grad_norm_var": 0.00047057072321573894, "learning_rate": 0.0001, "loss": 1.3125, "loss/crossentropy": 2.7885704040527344, "loss/fcd": 1.14453125, "loss/idx": 9.5, "loss/logits": 0.1679999828338623, "step": 5000 }, { "epoch": 0.07467578524552222, "grad_norm": 0.287109375, "grad_norm_var": 0.0004801392555236816, "learning_rate": 0.0001, "loss": 1.4205, "loss/crossentropy": 2.7082194089889526, "loss/fcd": 1.23046875, "loss/idx": 9.5, "loss/logits": 0.18999993801116943, "step": 5001 }, { "epoch": 0.0746907174161372, "grad_norm": 0.32421875, "grad_norm_var": 0.0007983803749084473, "learning_rate": 0.0001, "loss": 1.5146, "loss/crossentropy": 2.4135167598724365, "loss/fcd": 1.29296875, "loss/idx": 9.5, "loss/logits": 0.2216620221734047, "step": 5002 }, { "epoch": 0.07470564958675217, "grad_norm": 0.24609375, "grad_norm_var": 0.000800478458404541, "learning_rate": 0.0001, "loss": 1.3161, "loss/crossentropy": 2.769330859184265, "loss/fcd": 1.140625, "loss/idx": 9.5, "loss/logits": 0.17548343539237976, "step": 5003 }, { "epoch": 0.07472058175736716, "grad_norm": 0.2392578125, "grad_norm_var": 0.0007274627685546875, "learning_rate": 0.0001, "loss": 1.4985, "loss/crossentropy": 2.484681010246277, "loss/fcd": 1.28515625, "loss/idx": 9.5, "loss/logits": 0.21330367028713226, "step": 5004 }, { "epoch": 0.07473551392798214, "grad_norm": 0.24609375, "grad_norm_var": 0.0007190982500712077, "learning_rate": 0.0001, "loss": 1.4436, "loss/crossentropy": 2.573123574256897, "loss/fcd": 1.2421875, "loss/idx": 9.5, "loss/logits": 0.20141397416591644, "step": 5005 }, { "epoch": 0.07475044609859713, "grad_norm": 0.26171875, "grad_norm_var": 0.000720051924387614, "learning_rate": 0.0001, "loss": 1.6642, "loss/crossentropy": 2.615808129310608, "loss/fcd": 1.38671875, "loss/idx": 9.5, "loss/logits": 0.27748409658670425, "step": 5006 }, { "epoch": 0.0747653782692121, "grad_norm": 0.2421875, "grad_norm_var": 0.0007336894671122233, "learning_rate": 0.0001, "loss": 1.3621, "loss/crossentropy": 2.5906643867492676, "loss/fcd": 1.171875, "loss/idx": 9.5, "loss/logits": 0.1902056634426117, "step": 5007 }, { "epoch": 0.07478031043982708, "grad_norm": 0.2255859375, "grad_norm_var": 0.0007813612620035808, "learning_rate": 0.0001, "loss": 1.3905, "loss/crossentropy": 2.5043623447418213, "loss/fcd": 1.19140625, "loss/idx": 9.5, "loss/logits": 0.1991109400987625, "step": 5008 }, { "epoch": 0.07479524261044207, "grad_norm": 0.2490234375, "grad_norm_var": 0.000782612959543864, "learning_rate": 0.0001, "loss": 1.2401, "loss/crossentropy": 2.669127583503723, "loss/fcd": 1.08984375, "loss/idx": 9.5, "loss/logits": 0.15027133375406265, "step": 5009 }, { "epoch": 0.07481017478105705, "grad_norm": 0.2197265625, "grad_norm_var": 0.0007638891537984212, "learning_rate": 0.0001, "loss": 1.3793, "loss/crossentropy": 2.495224952697754, "loss/fcd": 1.1953125, "loss/idx": 9.5, "loss/logits": 0.18397365510463715, "step": 5010 }, { "epoch": 0.07482510695167204, "grad_norm": 0.2392578125, "grad_norm_var": 0.0007738749186197917, "learning_rate": 0.0001, "loss": 1.506, "loss/crossentropy": 2.5322606563568115, "loss/fcd": 1.27734375, "loss/idx": 9.5, "loss/logits": 0.2286146879196167, "step": 5011 }, { "epoch": 0.07484003912228701, "grad_norm": 0.25, "grad_norm_var": 0.0007731119791666666, "learning_rate": 0.0001, "loss": 1.3354, "loss/crossentropy": 2.3602042198181152, "loss/fcd": 1.16015625, "loss/idx": 9.5, "loss/logits": 0.17525430023670197, "step": 5012 }, { "epoch": 0.07485497129290199, "grad_norm": 0.25, "grad_norm_var": 0.0007521907488505045, "learning_rate": 0.0001, "loss": 1.3133, "loss/crossentropy": 2.617290735244751, "loss/fcd": 1.14453125, "loss/idx": 9.5, "loss/logits": 0.16875945031642914, "step": 5013 }, { "epoch": 0.07486990346351698, "grad_norm": 0.2099609375, "grad_norm_var": 0.0008504192034403483, "learning_rate": 0.0001, "loss": 1.4238, "loss/crossentropy": 2.3473033905029297, "loss/fcd": 1.21484375, "loss/idx": 9.5, "loss/logits": 0.20898184180259705, "step": 5014 }, { "epoch": 0.07488483563413195, "grad_norm": 0.251953125, "grad_norm_var": 0.0006974498430887858, "learning_rate": 0.0001, "loss": 1.3185, "loss/crossentropy": 2.3803902864456177, "loss/fcd": 1.1640625, "loss/idx": 9.5, "loss/logits": 0.15448611974716187, "step": 5015 }, { "epoch": 0.07489976780474694, "grad_norm": 0.2099609375, "grad_norm_var": 0.0007950425148010254, "learning_rate": 0.0001, "loss": 1.3183, "loss/crossentropy": 2.5621002912521362, "loss/fcd": 1.13671875, "loss/idx": 9.5, "loss/logits": 0.1815570741891861, "step": 5016 }, { "epoch": 0.07491469997536192, "grad_norm": 0.240234375, "grad_norm_var": 0.0006817460060119628, "learning_rate": 0.0001, "loss": 1.4543, "loss/crossentropy": 2.610450506210327, "loss/fcd": 1.24609375, "loss/idx": 9.5, "loss/logits": 0.2082061469554901, "step": 5017 }, { "epoch": 0.0749296321459769, "grad_norm": 0.2470703125, "grad_norm_var": 0.00022939046223958333, "learning_rate": 0.0001, "loss": 1.2952, "loss/crossentropy": 2.520108699798584, "loss/fcd": 1.125, "loss/idx": 9.5, "loss/logits": 0.17017512768507004, "step": 5018 }, { "epoch": 0.07494456431659188, "grad_norm": 0.263671875, "grad_norm_var": 0.0002647240956624349, "learning_rate": 0.0001, "loss": 1.3923, "loss/crossentropy": 2.4353092908859253, "loss/fcd": 1.203125, "loss/idx": 9.5, "loss/logits": 0.1891510933637619, "step": 5019 }, { "epoch": 0.07495949648720686, "grad_norm": 0.2734375, "grad_norm_var": 0.0003327329953511556, "learning_rate": 0.0001, "loss": 1.5571, "loss/crossentropy": 2.847368597984314, "loss/fcd": 1.33203125, "loss/idx": 9.5, "loss/logits": 0.2250385582447052, "step": 5020 }, { "epoch": 0.07497442865782185, "grad_norm": 0.2431640625, "grad_norm_var": 0.0003318627675374349, "learning_rate": 0.0001, "loss": 1.2797, "loss/crossentropy": 2.5441235303878784, "loss/fcd": 1.11328125, "loss/idx": 9.5, "loss/logits": 0.16638216376304626, "step": 5021 }, { "epoch": 0.07498936082843682, "grad_norm": 0.2275390625, "grad_norm_var": 0.00031642516454060874, "learning_rate": 0.0001, "loss": 1.3297, "loss/crossentropy": 2.7505040168762207, "loss/fcd": 1.1484375, "loss/idx": 9.5, "loss/logits": 0.18131131678819656, "step": 5022 }, { "epoch": 0.07500429299905181, "grad_norm": 0.248046875, "grad_norm_var": 0.0003201444943745931, "learning_rate": 0.0001, "loss": 1.3205, "loss/crossentropy": 2.6191060543060303, "loss/fcd": 1.15625, "loss/idx": 9.5, "loss/logits": 0.16421449184417725, "step": 5023 }, { "epoch": 0.07501922516966679, "grad_norm": 0.2421875, "grad_norm_var": 0.0003042697906494141, "learning_rate": 0.0001, "loss": 1.4653, "loss/crossentropy": 2.5858367681503296, "loss/fcd": 1.25, "loss/idx": 9.5, "loss/logits": 0.21530206501483917, "step": 5024 }, { "epoch": 0.07503415734028177, "grad_norm": 0.251953125, "grad_norm_var": 0.00030771493911743166, "learning_rate": 0.0001, "loss": 1.4612, "loss/crossentropy": 2.4953032732009888, "loss/fcd": 1.2734375, "loss/idx": 9.5, "loss/logits": 0.1877351701259613, "step": 5025 }, { "epoch": 0.07504908951089675, "grad_norm": 0.244140625, "grad_norm_var": 0.0002732435862223307, "learning_rate": 0.0001, "loss": 1.5417, "loss/crossentropy": 2.479909300804138, "loss/fcd": 1.328125, "loss/idx": 9.5, "loss/logits": 0.21360082179307938, "step": 5026 }, { "epoch": 0.07506402168151173, "grad_norm": 0.23828125, "grad_norm_var": 0.00027382771174112955, "learning_rate": 0.0001, "loss": 1.4226, "loss/crossentropy": 2.4720842838287354, "loss/fcd": 1.234375, "loss/idx": 9.5, "loss/logits": 0.18826506286859512, "step": 5027 }, { "epoch": 0.07507895385212672, "grad_norm": 0.251953125, "grad_norm_var": 0.0002758304278055827, "learning_rate": 0.0001, "loss": 1.4982, "loss/crossentropy": 2.667857050895691, "loss/fcd": 1.2890625, "loss/idx": 9.5, "loss/logits": 0.2091618776321411, "step": 5028 }, { "epoch": 0.0750938860227417, "grad_norm": 0.25390625, "grad_norm_var": 0.00028024911880493165, "learning_rate": 0.0001, "loss": 1.3746, "loss/crossentropy": 2.771187424659729, "loss/fcd": 1.1953125, "loss/idx": 9.5, "loss/logits": 0.17929986864328384, "step": 5029 }, { "epoch": 0.07510881819335667, "grad_norm": 0.26953125, "grad_norm_var": 0.0002349217732747396, "learning_rate": 0.0001, "loss": 1.3813, "loss/crossentropy": 2.7850500345230103, "loss/fcd": 1.1953125, "loss/idx": 9.5, "loss/logits": 0.18599988520145416, "step": 5030 }, { "epoch": 0.07512375036397166, "grad_norm": 0.26171875, "grad_norm_var": 0.00024692217508951825, "learning_rate": 0.0001, "loss": 1.5491, "loss/crossentropy": 2.515698790550232, "loss/fcd": 1.3125, "loss/idx": 9.5, "loss/logits": 0.23655154556035995, "step": 5031 }, { "epoch": 0.07513868253458664, "grad_norm": 0.2890625, "grad_norm_var": 0.00023758808771769205, "learning_rate": 0.0001, "loss": 1.5478, "loss/crossentropy": 2.670578718185425, "loss/fcd": 1.3125, "loss/idx": 9.5, "loss/logits": 0.2352820783853531, "step": 5032 }, { "epoch": 0.07515361470520163, "grad_norm": 0.27734375, "grad_norm_var": 0.0002611438433329264, "learning_rate": 0.0001, "loss": 1.4234, "loss/crossentropy": 2.505023717880249, "loss/fcd": 1.234375, "loss/idx": 9.5, "loss/logits": 0.1890258863568306, "step": 5033 }, { "epoch": 0.0751685468758166, "grad_norm": 0.26953125, "grad_norm_var": 0.00026836395263671873, "learning_rate": 0.0001, "loss": 1.5576, "loss/crossentropy": 2.3657586574554443, "loss/fcd": 1.3125, "loss/idx": 9.5, "loss/logits": 0.2451471984386444, "step": 5034 }, { "epoch": 0.07518347904643158, "grad_norm": 0.259765625, "grad_norm_var": 0.00026563008626302086, "learning_rate": 0.0001, "loss": 1.4098, "loss/crossentropy": 2.7120033502578735, "loss/fcd": 1.2109375, "loss/idx": 9.5, "loss/logits": 0.19889283925294876, "step": 5035 }, { "epoch": 0.07519841121704657, "grad_norm": 0.251953125, "grad_norm_var": 0.00024552345275878905, "learning_rate": 0.0001, "loss": 1.4107, "loss/crossentropy": 2.8484710454940796, "loss/fcd": 1.2109375, "loss/idx": 9.5, "loss/logits": 0.19976259768009186, "step": 5036 }, { "epoch": 0.07521334338766154, "grad_norm": 0.24609375, "grad_norm_var": 0.00024143457412719726, "learning_rate": 0.0001, "loss": 1.2924, "loss/crossentropy": 2.493568539619446, "loss/fcd": 1.125, "loss/idx": 9.5, "loss/logits": 0.16735904663801193, "step": 5037 }, { "epoch": 0.07522827555827653, "grad_norm": 0.244140625, "grad_norm_var": 0.00019745826721191407, "learning_rate": 0.0001, "loss": 1.5974, "loss/crossentropy": 2.2602596282958984, "loss/fcd": 1.37109375, "loss/idx": 9.5, "loss/logits": 0.22627072036266327, "step": 5038 }, { "epoch": 0.07524320772889151, "grad_norm": 0.216796875, "grad_norm_var": 0.0002925713857014974, "learning_rate": 0.0001, "loss": 1.2353, "loss/crossentropy": 2.6225202083587646, "loss/fcd": 1.08203125, "loss/idx": 9.5, "loss/logits": 0.15329130738973618, "step": 5039 }, { "epoch": 0.07525813989950648, "grad_norm": 0.2421875, "grad_norm_var": 0.0002925713857014974, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.719053864479065, "loss/fcd": 1.20703125, "loss/idx": 9.5, "loss/logits": 0.19307812303304672, "step": 5040 }, { "epoch": 0.07527307207012147, "grad_norm": 0.2490234375, "grad_norm_var": 0.00029401381810506186, "learning_rate": 0.0001, "loss": 1.4636, "loss/crossentropy": 2.605035424232483, "loss/fcd": 1.24609375, "loss/idx": 9.5, "loss/logits": 0.21753311157226562, "step": 5041 }, { "epoch": 0.07528800424073645, "grad_norm": 0.2119140625, "grad_norm_var": 0.0004016717274983724, "learning_rate": 0.0001, "loss": 1.416, "loss/crossentropy": 2.5798336267471313, "loss/fcd": 1.21484375, "loss/idx": 9.5, "loss/logits": 0.20114537328481674, "step": 5042 }, { "epoch": 0.07530293641135144, "grad_norm": 0.2333984375, "grad_norm_var": 0.00041214227676391604, "learning_rate": 0.0001, "loss": 1.308, "loss/crossentropy": 2.7200103998184204, "loss/fcd": 1.12890625, "loss/idx": 9.5, "loss/logits": 0.17911048978567123, "step": 5043 }, { "epoch": 0.07531786858196642, "grad_norm": 0.26171875, "grad_norm_var": 0.00041834115982055666, "learning_rate": 0.0001, "loss": 1.3823, "loss/crossentropy": 2.519296884536743, "loss/fcd": 1.19921875, "loss/idx": 9.5, "loss/logits": 0.18305829912424088, "step": 5044 }, { "epoch": 0.0753328007525814, "grad_norm": 0.28515625, "grad_norm_var": 0.0004857341448465983, "learning_rate": 0.0001, "loss": 1.5343, "loss/crossentropy": 2.945596218109131, "loss/fcd": 1.3046875, "loss/idx": 9.5, "loss/logits": 0.2295796275138855, "step": 5045 }, { "epoch": 0.07534773292319638, "grad_norm": 0.27734375, "grad_norm_var": 0.0005053798357645671, "learning_rate": 0.0001, "loss": 1.4877, "loss/crossentropy": 2.613662362098694, "loss/fcd": 1.27734375, "loss/idx": 9.5, "loss/logits": 0.21037355065345764, "step": 5046 }, { "epoch": 0.07536266509381136, "grad_norm": 0.25390625, "grad_norm_var": 0.000502010186513265, "learning_rate": 0.0001, "loss": 1.4123, "loss/crossentropy": 2.7278738021850586, "loss/fcd": 1.21875, "loss/idx": 9.5, "loss/logits": 0.19352957606315613, "step": 5047 }, { "epoch": 0.07537759726442635, "grad_norm": 0.255859375, "grad_norm_var": 0.00041716496149698894, "learning_rate": 0.0001, "loss": 1.5267, "loss/crossentropy": 2.5303350687026978, "loss/fcd": 1.30859375, "loss/idx": 9.5, "loss/logits": 0.21814490854740143, "step": 5048 }, { "epoch": 0.07539252943504132, "grad_norm": 0.25390625, "grad_norm_var": 0.0003731052080790202, "learning_rate": 0.0001, "loss": 1.4271, "loss/crossentropy": 2.6215769052505493, "loss/fcd": 1.2265625, "loss/idx": 9.5, "loss/logits": 0.20057407766580582, "step": 5049 }, { "epoch": 0.07540746160565631, "grad_norm": 0.28125, "grad_norm_var": 0.0004109660784403483, "learning_rate": 0.0001, "loss": 1.54, "loss/crossentropy": 2.6737096309661865, "loss/fcd": 1.31640625, "loss/idx": 9.5, "loss/logits": 0.2235897108912468, "step": 5050 }, { "epoch": 0.07542239377627129, "grad_norm": 0.234375, "grad_norm_var": 0.00042336384455362956, "learning_rate": 0.0001, "loss": 1.3156, "loss/crossentropy": 2.577537178993225, "loss/fcd": 1.1484375, "loss/idx": 9.5, "loss/logits": 0.16717156022787094, "step": 5051 }, { "epoch": 0.07543732594688626, "grad_norm": 0.283203125, "grad_norm_var": 0.0004927913347880045, "learning_rate": 0.0001, "loss": 1.4737, "loss/crossentropy": 2.5035687685012817, "loss/fcd": 1.2578125, "loss/idx": 9.5, "loss/logits": 0.21589966863393784, "step": 5052 }, { "epoch": 0.07545225811750125, "grad_norm": 0.2451171875, "grad_norm_var": 0.0004936059315999349, "learning_rate": 0.0001, "loss": 1.4164, "loss/crossentropy": 2.6787275075912476, "loss/fcd": 1.22265625, "loss/idx": 9.5, "loss/logits": 0.1937669962644577, "step": 5053 }, { "epoch": 0.07546719028811623, "grad_norm": 0.251953125, "grad_norm_var": 0.0004894097646077473, "learning_rate": 0.0001, "loss": 1.5645, "loss/crossentropy": 2.4479689598083496, "loss/fcd": 1.33984375, "loss/idx": 9.5, "loss/logits": 0.224617600440979, "step": 5054 }, { "epoch": 0.07548212245873122, "grad_norm": 0.2578125, "grad_norm_var": 0.00040028889973958335, "learning_rate": 0.0001, "loss": 1.471, "loss/crossentropy": 2.6475915908813477, "loss/fcd": 1.24609375, "loss/idx": 9.5, "loss/logits": 0.22486238926649094, "step": 5055 }, { "epoch": 0.0754970546293462, "grad_norm": 0.2373046875, "grad_norm_var": 0.00041004419326782224, "learning_rate": 0.0001, "loss": 1.4223, "loss/crossentropy": 2.6634167432785034, "loss/fcd": 1.2109375, "loss/idx": 9.5, "loss/logits": 0.2113344892859459, "step": 5056 }, { "epoch": 0.07551198679996117, "grad_norm": 0.298828125, "grad_norm_var": 0.0005281925201416016, "learning_rate": 0.0001, "loss": 1.6823, "loss/crossentropy": 2.8065439462661743, "loss/fcd": 1.3984375, "loss/idx": 9.5, "loss/logits": 0.28385747969150543, "step": 5057 }, { "epoch": 0.07552691897057616, "grad_norm": 0.232421875, "grad_norm_var": 0.00042930841445922854, "learning_rate": 0.0001, "loss": 1.269, "loss/crossentropy": 2.384643793106079, "loss/fcd": 1.1171875, "loss/idx": 9.5, "loss/logits": 0.15180929005146027, "step": 5058 }, { "epoch": 0.07554185114119114, "grad_norm": 0.255859375, "grad_norm_var": 0.00038425127665201825, "learning_rate": 0.0001, "loss": 1.4403, "loss/crossentropy": 2.6302170753479004, "loss/fcd": 1.2421875, "loss/idx": 9.5, "loss/logits": 0.1981019303202629, "step": 5059 }, { "epoch": 0.07555678331180612, "grad_norm": 0.2412109375, "grad_norm_var": 0.00040686527887980143, "learning_rate": 0.0001, "loss": 1.3595, "loss/crossentropy": 2.7237402200698853, "loss/fcd": 1.17578125, "loss/idx": 9.5, "loss/logits": 0.18370267748832703, "step": 5060 }, { "epoch": 0.0755717154824211, "grad_norm": 0.220703125, "grad_norm_var": 0.00044253269831339516, "learning_rate": 0.0001, "loss": 1.485, "loss/crossentropy": 2.520729422569275, "loss/fcd": 1.26953125, "loss/idx": 9.5, "loss/logits": 0.21544699370861053, "step": 5061 }, { "epoch": 0.07558664765303609, "grad_norm": 0.23828125, "grad_norm_var": 0.0004218697547912598, "learning_rate": 0.0001, "loss": 1.4433, "loss/crossentropy": 2.4671618938446045, "loss/fcd": 1.234375, "loss/idx": 9.5, "loss/logits": 0.20891348272562027, "step": 5062 }, { "epoch": 0.07560157982365107, "grad_norm": 0.21484375, "grad_norm_var": 0.000510561466217041, "learning_rate": 0.0001, "loss": 1.4247, "loss/crossentropy": 2.178975820541382, "loss/fcd": 1.23828125, "loss/idx": 9.5, "loss/logits": 0.1864452362060547, "step": 5063 }, { "epoch": 0.07561651199426604, "grad_norm": 0.2451171875, "grad_norm_var": 0.0005096435546875, "learning_rate": 0.0001, "loss": 1.4779, "loss/crossentropy": 2.455274820327759, "loss/fcd": 1.27734375, "loss/idx": 9.5, "loss/logits": 0.2005208283662796, "step": 5064 }, { "epoch": 0.07563144416488103, "grad_norm": 0.236328125, "grad_norm_var": 0.0005186557769775391, "learning_rate": 0.0001, "loss": 1.3817, "loss/crossentropy": 2.6611663103103638, "loss/fcd": 1.19921875, "loss/idx": 9.5, "loss/logits": 0.18244870007038116, "step": 5065 }, { "epoch": 0.07564637633549601, "grad_norm": 0.2265625, "grad_norm_var": 0.0004661401112874349, "learning_rate": 0.0001, "loss": 1.3612, "loss/crossentropy": 2.8894442319869995, "loss/fcd": 1.16796875, "loss/idx": 9.5, "loss/logits": 0.1932310163974762, "step": 5066 }, { "epoch": 0.075661308506111, "grad_norm": 0.2216796875, "grad_norm_var": 0.0004941900571187337, "learning_rate": 0.0001, "loss": 1.2949, "loss/crossentropy": 2.5144734382629395, "loss/fcd": 1.13671875, "loss/idx": 9.5, "loss/logits": 0.15819182246923447, "step": 5067 }, { "epoch": 0.07567624067672597, "grad_norm": 0.228515625, "grad_norm_var": 0.000396724541982015, "learning_rate": 0.0001, "loss": 1.2893, "loss/crossentropy": 2.7911046743392944, "loss/fcd": 1.125, "loss/idx": 9.5, "loss/logits": 0.16427671909332275, "step": 5068 }, { "epoch": 0.07569117284734095, "grad_norm": 0.234375, "grad_norm_var": 0.0003977298736572266, "learning_rate": 0.0001, "loss": 1.2468, "loss/crossentropy": 2.5882694721221924, "loss/fcd": 1.09375, "loss/idx": 9.5, "loss/logits": 0.15300536900758743, "step": 5069 }, { "epoch": 0.07570610501795594, "grad_norm": 0.2265625, "grad_norm_var": 0.0003979365030924479, "learning_rate": 0.0001, "loss": 1.3313, "loss/crossentropy": 2.6746450662612915, "loss/fcd": 1.1484375, "loss/idx": 9.5, "loss/logits": 0.1828317567706108, "step": 5070 }, { "epoch": 0.07572103718857091, "grad_norm": 0.2392578125, "grad_norm_var": 0.00037173827489217124, "learning_rate": 0.0001, "loss": 1.4805, "loss/crossentropy": 2.7090624570846558, "loss/fcd": 1.26171875, "loss/idx": 9.5, "loss/logits": 0.21876505762338638, "step": 5071 }, { "epoch": 0.0757359693591859, "grad_norm": 0.2080078125, "grad_norm_var": 0.00042562087376912436, "learning_rate": 0.0001, "loss": 1.3162, "loss/crossentropy": 2.901418685913086, "loss/fcd": 1.14453125, "loss/idx": 9.5, "loss/logits": 0.1716393455862999, "step": 5072 }, { "epoch": 0.07575090152980088, "grad_norm": 0.3125, "grad_norm_var": 0.0005526820818583171, "learning_rate": 0.0001, "loss": 1.6205, "loss/crossentropy": 2.5713216066360474, "loss/fcd": 1.3671875, "loss/idx": 9.5, "loss/logits": 0.25333552062511444, "step": 5073 }, { "epoch": 0.07576583370041585, "grad_norm": 0.25, "grad_norm_var": 0.0005626956621805827, "learning_rate": 0.0001, "loss": 1.3972, "loss/crossentropy": 2.3760851621627808, "loss/fcd": 1.2109375, "loss/idx": 9.5, "loss/logits": 0.1862817034125328, "step": 5074 }, { "epoch": 0.07578076587103084, "grad_norm": 0.25390625, "grad_norm_var": 0.0005581498146057129, "learning_rate": 0.0001, "loss": 1.4549, "loss/crossentropy": 2.735738158226013, "loss/fcd": 1.2421875, "loss/idx": 9.5, "loss/logits": 0.2127314805984497, "step": 5075 }, { "epoch": 0.07579569804164582, "grad_norm": 0.2216796875, "grad_norm_var": 0.0005719780921936036, "learning_rate": 0.0001, "loss": 1.412, "loss/crossentropy": 2.508322596549988, "loss/fcd": 1.21484375, "loss/idx": 9.5, "loss/logits": 0.19713224470615387, "step": 5076 }, { "epoch": 0.07581063021226081, "grad_norm": 0.224609375, "grad_norm_var": 0.000564889113108317, "learning_rate": 0.0001, "loss": 1.4063, "loss/crossentropy": 2.6777249574661255, "loss/fcd": 1.20703125, "loss/idx": 9.5, "loss/logits": 0.19930900633335114, "step": 5077 }, { "epoch": 0.07582556238287579, "grad_norm": 0.248046875, "grad_norm_var": 0.0005733132362365723, "learning_rate": 0.0001, "loss": 1.4195, "loss/crossentropy": 2.501622796058655, "loss/fcd": 1.22265625, "loss/idx": 9.5, "loss/logits": 0.19687031209468842, "step": 5078 }, { "epoch": 0.07584049455349076, "grad_norm": 0.2578125, "grad_norm_var": 0.0005617737770080566, "learning_rate": 0.0001, "loss": 1.4283, "loss/crossentropy": 2.4919172525405884, "loss/fcd": 1.23046875, "loss/idx": 9.5, "loss/logits": 0.1978168562054634, "step": 5079 }, { "epoch": 0.07585542672410575, "grad_norm": 0.244140625, "grad_norm_var": 0.0005611260732014974, "learning_rate": 0.0001, "loss": 1.4337, "loss/crossentropy": 2.544094681739807, "loss/fcd": 1.23046875, "loss/idx": 9.5, "loss/logits": 0.20323339849710464, "step": 5080 }, { "epoch": 0.07587035889472073, "grad_norm": 0.2470703125, "grad_norm_var": 0.0005636175473531087, "learning_rate": 0.0001, "loss": 1.4411, "loss/crossentropy": 2.2464324235916138, "loss/fcd": 1.23828125, "loss/idx": 9.5, "loss/logits": 0.202776238322258, "step": 5081 }, { "epoch": 0.07588529106533572, "grad_norm": 0.2578125, "grad_norm_var": 0.0005674322446187337, "learning_rate": 0.0001, "loss": 1.5101, "loss/crossentropy": 2.591498851776123, "loss/fcd": 1.29296875, "loss/idx": 9.5, "loss/logits": 0.21711519360542297, "step": 5082 }, { "epoch": 0.07590022323595069, "grad_norm": 0.298828125, "grad_norm_var": 0.00072784423828125, "learning_rate": 0.0001, "loss": 1.4457, "loss/crossentropy": 2.7192612886428833, "loss/fcd": 1.25390625, "loss/idx": 9.5, "loss/logits": 0.19182515144348145, "step": 5083 }, { "epoch": 0.07591515540656568, "grad_norm": 0.2265625, "grad_norm_var": 0.0007329146067301432, "learning_rate": 0.0001, "loss": 1.3609, "loss/crossentropy": 2.7668250799179077, "loss/fcd": 1.17578125, "loss/idx": 9.5, "loss/logits": 0.18508435785770416, "step": 5084 }, { "epoch": 0.07593008757718066, "grad_norm": 0.22265625, "grad_norm_var": 0.0007611433664957682, "learning_rate": 0.0001, "loss": 1.4732, "loss/crossentropy": 2.348276734352112, "loss/fcd": 1.2578125, "loss/idx": 9.5, "loss/logits": 0.21540939062833786, "step": 5085 }, { "epoch": 0.07594501974779563, "grad_norm": 0.2451171875, "grad_norm_var": 0.0007340391476949056, "learning_rate": 0.0001, "loss": 1.4858, "loss/crossentropy": 2.4238357543945312, "loss/fcd": 1.265625, "loss/idx": 9.5, "loss/logits": 0.2201375663280487, "step": 5086 }, { "epoch": 0.07595995191841062, "grad_norm": 0.2216796875, "grad_norm_var": 0.0007723768552144369, "learning_rate": 0.0001, "loss": 1.2738, "loss/crossentropy": 2.6579710245132446, "loss/fcd": 1.11328125, "loss/idx": 9.5, "loss/logits": 0.16052207350730896, "step": 5087 }, { "epoch": 0.0759748840890256, "grad_norm": 0.22265625, "grad_norm_var": 0.0007110436757405599, "learning_rate": 0.0001, "loss": 1.4936, "loss/crossentropy": 2.333399772644043, "loss/fcd": 1.28125, "loss/idx": 9.5, "loss/logits": 0.21236715465784073, "step": 5088 }, { "epoch": 0.07598981625964059, "grad_norm": 0.2333984375, "grad_norm_var": 0.0004133184750874837, "learning_rate": 0.0001, "loss": 1.3779, "loss/crossentropy": 2.537507176399231, "loss/fcd": 1.1875, "loss/idx": 9.5, "loss/logits": 0.19044385850429535, "step": 5089 }, { "epoch": 0.07600474843025556, "grad_norm": 0.255859375, "grad_norm_var": 0.0004215200742085775, "learning_rate": 0.0001, "loss": 1.292, "loss/crossentropy": 2.4454145431518555, "loss/fcd": 1.140625, "loss/idx": 9.5, "loss/logits": 0.15133393555879593, "step": 5090 }, { "epoch": 0.07601968060087054, "grad_norm": 0.265625, "grad_norm_var": 0.00044774611790974935, "learning_rate": 0.0001, "loss": 1.4389, "loss/crossentropy": 2.5193099975585938, "loss/fcd": 1.24609375, "loss/idx": 9.5, "loss/logits": 0.19278177618980408, "step": 5091 }, { "epoch": 0.07603461277148553, "grad_norm": 0.3515625, "grad_norm_var": 0.001126861572265625, "learning_rate": 0.0001, "loss": 1.9854, "loss/crossentropy": 2.8576278686523438, "loss/fcd": 1.6953125, "loss/idx": 9.5, "loss/logits": 0.29009754955768585, "step": 5092 }, { "epoch": 0.0760495449421005, "grad_norm": 0.265625, "grad_norm_var": 0.0010851383209228515, "learning_rate": 0.0001, "loss": 1.6391, "loss/crossentropy": 2.3126262426376343, "loss/fcd": 1.38671875, "loss/idx": 9.5, "loss/logits": 0.2523748129606247, "step": 5093 }, { "epoch": 0.0760644771127155, "grad_norm": 0.2255859375, "grad_norm_var": 0.0011345823605855307, "learning_rate": 0.0001, "loss": 1.3489, "loss/crossentropy": 2.4873276948928833, "loss/fcd": 1.171875, "loss/idx": 9.5, "loss/logits": 0.17699521034955978, "step": 5094 }, { "epoch": 0.07607940928333047, "grad_norm": 0.2333984375, "grad_norm_var": 0.001154947280883789, "learning_rate": 0.0001, "loss": 1.4315, "loss/crossentropy": 2.5906184911727905, "loss/fcd": 1.21484375, "loss/idx": 9.5, "loss/logits": 0.21664565801620483, "step": 5095 }, { "epoch": 0.07609434145394545, "grad_norm": 0.244140625, "grad_norm_var": 0.001154947280883789, "learning_rate": 0.0001, "loss": 1.2541, "loss/crossentropy": 2.805786609649658, "loss/fcd": 1.09765625, "loss/idx": 9.5, "loss/logits": 0.15645554661750793, "step": 5096 }, { "epoch": 0.07610927362456044, "grad_norm": 0.263671875, "grad_norm_var": 0.0011632561683654786, "learning_rate": 0.0001, "loss": 1.5122, "loss/crossentropy": 2.573467493057251, "loss/fcd": 1.29296875, "loss/idx": 9.5, "loss/logits": 0.21926897019147873, "step": 5097 }, { "epoch": 0.07612420579517541, "grad_norm": 0.28125, "grad_norm_var": 0.0012153267860412597, "learning_rate": 0.0001, "loss": 1.3353, "loss/crossentropy": 2.6646405458450317, "loss/fcd": 1.1484375, "loss/idx": 9.5, "loss/logits": 0.18691112101078033, "step": 5098 }, { "epoch": 0.0761391379657904, "grad_norm": 0.21484375, "grad_norm_var": 0.0011497139930725098, "learning_rate": 0.0001, "loss": 1.3185, "loss/crossentropy": 2.574557065963745, "loss/fcd": 1.15234375, "loss/idx": 9.5, "loss/logits": 0.1661849319934845, "step": 5099 }, { "epoch": 0.07615407013640538, "grad_norm": 0.2314453125, "grad_norm_var": 0.0011370182037353516, "learning_rate": 0.0001, "loss": 1.3958, "loss/crossentropy": 2.6712297201156616, "loss/fcd": 1.19921875, "loss/idx": 9.5, "loss/logits": 0.1965685337781906, "step": 5100 }, { "epoch": 0.07616900230702035, "grad_norm": 0.2314453125, "grad_norm_var": 0.0011113762855529785, "learning_rate": 0.0001, "loss": 1.3914, "loss/crossentropy": 2.6363991498947144, "loss/fcd": 1.203125, "loss/idx": 9.5, "loss/logits": 0.18822675943374634, "step": 5101 }, { "epoch": 0.07618393447763534, "grad_norm": 0.2353515625, "grad_norm_var": 0.0011226614316304526, "learning_rate": 0.0001, "loss": 1.5673, "loss/crossentropy": 2.4779120683670044, "loss/fcd": 1.328125, "loss/idx": 9.5, "loss/logits": 0.2392081916332245, "step": 5102 }, { "epoch": 0.07619886664825032, "grad_norm": 0.25, "grad_norm_var": 0.00107115109761556, "learning_rate": 0.0001, "loss": 1.4817, "loss/crossentropy": 2.7181471586227417, "loss/fcd": 1.26953125, "loss/idx": 9.5, "loss/logits": 0.21212109923362732, "step": 5103 }, { "epoch": 0.07621379881886531, "grad_norm": 0.25390625, "grad_norm_var": 0.0010167280832926431, "learning_rate": 0.0001, "loss": 1.5011, "loss/crossentropy": 2.5279964208602905, "loss/fcd": 1.29296875, "loss/idx": 9.5, "loss/logits": 0.20815738290548325, "step": 5104 }, { "epoch": 0.07622873098948028, "grad_norm": 0.234375, "grad_norm_var": 0.0010143240292867024, "learning_rate": 0.0001, "loss": 1.1984, "loss/crossentropy": 2.487188458442688, "loss/fcd": 1.056640625, "loss/idx": 9.5, "loss/logits": 0.1417834609746933, "step": 5105 }, { "epoch": 0.07624366316009527, "grad_norm": 0.236328125, "grad_norm_var": 0.0010291059811909993, "learning_rate": 0.0001, "loss": 1.5584, "loss/crossentropy": 2.4708369970321655, "loss/fcd": 1.33203125, "loss/idx": 9.5, "loss/logits": 0.2263539582490921, "step": 5106 }, { "epoch": 0.07625859533071025, "grad_norm": 0.255859375, "grad_norm_var": 0.001016231377919515, "learning_rate": 0.0001, "loss": 1.4289, "loss/crossentropy": 2.7216286659240723, "loss/fcd": 1.234375, "loss/idx": 9.5, "loss/logits": 0.19450105726718903, "step": 5107 }, { "epoch": 0.07627352750132523, "grad_norm": 0.2470703125, "grad_norm_var": 0.00029129981994628905, "learning_rate": 0.0001, "loss": 1.5044, "loss/crossentropy": 2.6633700132369995, "loss/fcd": 1.28125, "loss/idx": 9.5, "loss/logits": 0.22311286628246307, "step": 5108 }, { "epoch": 0.07628845967194021, "grad_norm": 0.234375, "grad_norm_var": 0.0002623081207275391, "learning_rate": 0.0001, "loss": 1.3724, "loss/crossentropy": 2.555517792701721, "loss/fcd": 1.1953125, "loss/idx": 9.5, "loss/logits": 0.1770630031824112, "step": 5109 }, { "epoch": 0.07630339184255519, "grad_norm": 0.2734375, "grad_norm_var": 0.0003002762794494629, "learning_rate": 0.0001, "loss": 1.4032, "loss/crossentropy": 2.8078391551971436, "loss/fcd": 1.21875, "loss/idx": 9.5, "loss/logits": 0.18448375165462494, "step": 5110 }, { "epoch": 0.07631832401317018, "grad_norm": 0.2431640625, "grad_norm_var": 0.00029105742772420247, "learning_rate": 0.0001, "loss": 1.4524, "loss/crossentropy": 2.7696194648742676, "loss/fcd": 1.24609375, "loss/idx": 9.5, "loss/logits": 0.20628274977207184, "step": 5111 }, { "epoch": 0.07633325618378516, "grad_norm": 0.251953125, "grad_norm_var": 0.00029328266779581703, "learning_rate": 0.0001, "loss": 1.5322, "loss/crossentropy": 2.599382162094116, "loss/fcd": 1.30078125, "loss/idx": 9.5, "loss/logits": 0.23142912983894348, "step": 5112 }, { "epoch": 0.07634818835440013, "grad_norm": 0.283203125, "grad_norm_var": 0.0003627419471740723, "learning_rate": 0.0001, "loss": 1.5713, "loss/crossentropy": 2.4915467500686646, "loss/fcd": 1.34375, "loss/idx": 9.5, "loss/logits": 0.22750063985586166, "step": 5113 }, { "epoch": 0.07636312052501512, "grad_norm": 0.23046875, "grad_norm_var": 0.0002945542335510254, "learning_rate": 0.0001, "loss": 1.385, "loss/crossentropy": 2.6941221952438354, "loss/fcd": 1.203125, "loss/idx": 9.5, "loss/logits": 0.1819169893860817, "step": 5114 }, { "epoch": 0.0763780526956301, "grad_norm": 0.25390625, "grad_norm_var": 0.0002370158831278483, "learning_rate": 0.0001, "loss": 1.4916, "loss/crossentropy": 2.608971357345581, "loss/fcd": 1.27734375, "loss/idx": 9.5, "loss/logits": 0.21423009037971497, "step": 5115 }, { "epoch": 0.07639298486624509, "grad_norm": 0.23828125, "grad_norm_var": 0.00022608439127604166, "learning_rate": 0.0001, "loss": 1.3787, "loss/crossentropy": 2.486601948738098, "loss/fcd": 1.19140625, "loss/idx": 9.5, "loss/logits": 0.18729091435670853, "step": 5116 }, { "epoch": 0.07640791703686006, "grad_norm": 0.30078125, "grad_norm_var": 0.00038210153579711916, "learning_rate": 0.0001, "loss": 1.4915, "loss/crossentropy": 2.727733612060547, "loss/fcd": 1.28515625, "loss/idx": 9.5, "loss/logits": 0.20636918395757675, "step": 5117 }, { "epoch": 0.07642284920747504, "grad_norm": 0.2353515625, "grad_norm_var": 0.00038210153579711916, "learning_rate": 0.0001, "loss": 1.2282, "loss/crossentropy": 2.748258590698242, "loss/fcd": 1.06640625, "loss/idx": 9.5, "loss/logits": 0.16180330514907837, "step": 5118 }, { "epoch": 0.07643778137809003, "grad_norm": 0.2470703125, "grad_norm_var": 0.00038318634033203126, "learning_rate": 0.0001, "loss": 1.611, "loss/crossentropy": 2.573309302330017, "loss/fcd": 1.359375, "loss/idx": 9.5, "loss/logits": 0.2515895366668701, "step": 5119 }, { "epoch": 0.076452713548705, "grad_norm": 0.27734375, "grad_norm_var": 0.00042591094970703127, "learning_rate": 0.0001, "loss": 1.4564, "loss/crossentropy": 2.566595673561096, "loss/fcd": 1.265625, "loss/idx": 9.5, "loss/logits": 0.19072673469781876, "step": 5120 }, { "epoch": 0.07646764571932, "grad_norm": 0.2451171875, "grad_norm_var": 0.00040689706802368166, "learning_rate": 0.0001, "loss": 1.48, "loss/crossentropy": 2.376437783241272, "loss/fcd": 1.27734375, "loss/idx": 9.5, "loss/logits": 0.20263876765966415, "step": 5121 }, { "epoch": 0.07648257788993497, "grad_norm": 0.2412109375, "grad_norm_var": 0.00039730072021484377, "learning_rate": 0.0001, "loss": 1.4213, "loss/crossentropy": 2.6733362674713135, "loss/fcd": 1.234375, "loss/idx": 9.5, "loss/logits": 0.18687717616558075, "step": 5122 }, { "epoch": 0.07649751006054996, "grad_norm": 0.267578125, "grad_norm_var": 0.0004093170166015625, "learning_rate": 0.0001, "loss": 1.529, "loss/crossentropy": 2.5709919929504395, "loss/fcd": 1.32421875, "loss/idx": 9.5, "loss/logits": 0.20482853055000305, "step": 5123 }, { "epoch": 0.07651244223116493, "grad_norm": 0.2119140625, "grad_norm_var": 0.0005208969116210938, "learning_rate": 0.0001, "loss": 1.3036, "loss/crossentropy": 2.7623976469039917, "loss/fcd": 1.12890625, "loss/idx": 9.5, "loss/logits": 0.17468653619289398, "step": 5124 }, { "epoch": 0.07652737440177991, "grad_norm": 0.240234375, "grad_norm_var": 0.0005091190338134766, "learning_rate": 0.0001, "loss": 1.4445, "loss/crossentropy": 2.686112880706787, "loss/fcd": 1.234375, "loss/idx": 9.5, "loss/logits": 0.21011222153902054, "step": 5125 }, { "epoch": 0.0765423065723949, "grad_norm": 0.20703125, "grad_norm_var": 0.0005999088287353515, "learning_rate": 0.0001, "loss": 1.4299, "loss/crossentropy": 2.3775455951690674, "loss/fcd": 1.2265625, "loss/idx": 9.5, "loss/logits": 0.20335911214351654, "step": 5126 }, { "epoch": 0.07655723874300988, "grad_norm": 0.27734375, "grad_norm_var": 0.0006490031878153483, "learning_rate": 0.0001, "loss": 1.3668, "loss/crossentropy": 2.485181212425232, "loss/fcd": 1.18359375, "loss/idx": 9.5, "loss/logits": 0.18322619795799255, "step": 5127 }, { "epoch": 0.07657217091362487, "grad_norm": 0.2451171875, "grad_norm_var": 0.0006506443023681641, "learning_rate": 0.0001, "loss": 1.611, "loss/crossentropy": 2.3444716930389404, "loss/fcd": 1.3671875, "loss/idx": 9.5, "loss/logits": 0.243795707821846, "step": 5128 }, { "epoch": 0.07658710308423984, "grad_norm": 0.251953125, "grad_norm_var": 0.0005738417307535808, "learning_rate": 0.0001, "loss": 1.3963, "loss/crossentropy": 2.571563959121704, "loss/fcd": 1.2109375, "loss/idx": 9.5, "loss/logits": 0.1853732317686081, "step": 5129 }, { "epoch": 0.07660203525485482, "grad_norm": 0.26953125, "grad_norm_var": 0.0005770206451416015, "learning_rate": 0.0001, "loss": 1.5729, "loss/crossentropy": 2.7768527269363403, "loss/fcd": 1.34375, "loss/idx": 9.5, "loss/logits": 0.229185089468956, "step": 5130 }, { "epoch": 0.0766169674254698, "grad_norm": 0.267578125, "grad_norm_var": 0.0005947113037109375, "learning_rate": 0.0001, "loss": 1.4411, "loss/crossentropy": 2.6883628368377686, "loss/fcd": 1.234375, "loss/idx": 9.5, "loss/logits": 0.2067563310265541, "step": 5131 }, { "epoch": 0.07663189959608478, "grad_norm": 0.236328125, "grad_norm_var": 0.0005983829498291016, "learning_rate": 0.0001, "loss": 1.2909, "loss/crossentropy": 2.728757619857788, "loss/fcd": 1.125, "loss/idx": 9.5, "loss/logits": 0.16593027114868164, "step": 5132 }, { "epoch": 0.07664683176669977, "grad_norm": 0.240234375, "grad_norm_var": 0.0004283905029296875, "learning_rate": 0.0001, "loss": 1.3185, "loss/crossentropy": 2.5457849502563477, "loss/fcd": 1.15234375, "loss/idx": 9.5, "loss/logits": 0.16620613634586334, "step": 5133 }, { "epoch": 0.07666176393731475, "grad_norm": 0.2431640625, "grad_norm_var": 0.0004194895426432292, "learning_rate": 0.0001, "loss": 1.3232, "loss/crossentropy": 2.5914580821990967, "loss/fcd": 1.15234375, "loss/idx": 9.5, "loss/logits": 0.1708204373717308, "step": 5134 }, { "epoch": 0.07667669610792972, "grad_norm": 0.224609375, "grad_norm_var": 0.00045394500096639, "learning_rate": 0.0001, "loss": 1.4846, "loss/crossentropy": 2.4769705533981323, "loss/fcd": 1.26953125, "loss/idx": 9.5, "loss/logits": 0.21511352062225342, "step": 5135 }, { "epoch": 0.07669162827854471, "grad_norm": 0.21484375, "grad_norm_var": 0.0004422465960184733, "learning_rate": 0.0001, "loss": 1.328, "loss/crossentropy": 2.568962812423706, "loss/fcd": 1.15625, "loss/idx": 9.5, "loss/logits": 0.17174885421991348, "step": 5136 }, { "epoch": 0.07670656044915969, "grad_norm": 0.2255859375, "grad_norm_var": 0.00045988957087198895, "learning_rate": 0.0001, "loss": 1.3339, "loss/crossentropy": 2.6244195699691772, "loss/fcd": 1.15234375, "loss/idx": 9.5, "loss/logits": 0.1815110668540001, "step": 5137 }, { "epoch": 0.07672149261977468, "grad_norm": 0.2197265625, "grad_norm_var": 0.0004896124203999838, "learning_rate": 0.0001, "loss": 1.3537, "loss/crossentropy": 2.574183940887451, "loss/fcd": 1.1640625, "loss/idx": 9.5, "loss/logits": 0.18964031338691711, "step": 5138 }, { "epoch": 0.07673642479038965, "grad_norm": 0.20703125, "grad_norm_var": 0.0004974961280822754, "learning_rate": 0.0001, "loss": 1.3718, "loss/crossentropy": 2.6220799684524536, "loss/fcd": 1.17578125, "loss/idx": 9.5, "loss/logits": 0.1960662305355072, "step": 5139 }, { "epoch": 0.07675135696100463, "grad_norm": 0.23046875, "grad_norm_var": 0.0004584630330403646, "learning_rate": 0.0001, "loss": 1.3911, "loss/crossentropy": 2.4857165813446045, "loss/fcd": 1.203125, "loss/idx": 9.5, "loss/logits": 0.18795308470726013, "step": 5140 }, { "epoch": 0.07676628913161962, "grad_norm": 0.294921875, "grad_norm_var": 0.0006649653116861979, "learning_rate": 0.0001, "loss": 1.5093, "loss/crossentropy": 2.8819305896759033, "loss/fcd": 1.3046875, "loss/idx": 9.5, "loss/logits": 0.2046537548303604, "step": 5141 }, { "epoch": 0.0767812213022346, "grad_norm": 0.255859375, "grad_norm_var": 0.0005930423736572266, "learning_rate": 0.0001, "loss": 1.4915, "loss/crossentropy": 2.2802118062973022, "loss/fcd": 1.265625, "loss/idx": 9.5, "loss/logits": 0.22586563229560852, "step": 5142 }, { "epoch": 0.07679615347284958, "grad_norm": 0.220703125, "grad_norm_var": 0.0005418777465820313, "learning_rate": 0.0001, "loss": 1.3227, "loss/crossentropy": 2.4039119482040405, "loss/fcd": 1.1484375, "loss/idx": 9.5, "loss/logits": 0.17424248158931732, "step": 5143 }, { "epoch": 0.07681108564346456, "grad_norm": 0.2216796875, "grad_norm_var": 0.0005617141723632812, "learning_rate": 0.0001, "loss": 1.2751, "loss/crossentropy": 2.287134051322937, "loss/fcd": 1.125, "loss/idx": 9.5, "loss/logits": 0.15010139346122742, "step": 5144 }, { "epoch": 0.07682601781407955, "grad_norm": 0.2421875, "grad_norm_var": 0.0005508263905843099, "learning_rate": 0.0001, "loss": 1.3916, "loss/crossentropy": 2.7994461059570312, "loss/fcd": 1.1796875, "loss/idx": 9.5, "loss/logits": 0.21194376051425934, "step": 5145 }, { "epoch": 0.07684094998469453, "grad_norm": 0.26953125, "grad_norm_var": 0.0005508263905843099, "learning_rate": 0.0001, "loss": 1.5654, "loss/crossentropy": 2.5632572174072266, "loss/fcd": 1.33203125, "loss/idx": 9.5, "loss/logits": 0.2333315759897232, "step": 5146 }, { "epoch": 0.0768558821553095, "grad_norm": 0.240234375, "grad_norm_var": 0.000491189956665039, "learning_rate": 0.0001, "loss": 1.298, "loss/crossentropy": 2.6366525888442993, "loss/fcd": 1.12890625, "loss/idx": 9.5, "loss/logits": 0.16904538869857788, "step": 5147 }, { "epoch": 0.07687081432592449, "grad_norm": 0.236328125, "grad_norm_var": 0.000491189956665039, "learning_rate": 0.0001, "loss": 1.3794, "loss/crossentropy": 2.484553813934326, "loss/fcd": 1.1796875, "loss/idx": 9.5, "loss/logits": 0.19972462952136993, "step": 5148 }, { "epoch": 0.07688574649653947, "grad_norm": 0.2333984375, "grad_norm_var": 0.000490883986155192, "learning_rate": 0.0001, "loss": 1.422, "loss/crossentropy": 2.659458875656128, "loss/fcd": 1.22265625, "loss/idx": 9.5, "loss/logits": 0.1993173137307167, "step": 5149 }, { "epoch": 0.07690067866715446, "grad_norm": 0.236328125, "grad_norm_var": 0.000487518310546875, "learning_rate": 0.0001, "loss": 1.3814, "loss/crossentropy": 2.592710494995117, "loss/fcd": 1.1953125, "loss/idx": 9.5, "loss/logits": 0.18613114953041077, "step": 5150 }, { "epoch": 0.07691561083776943, "grad_norm": 0.234375, "grad_norm_var": 0.00047885576883951824, "learning_rate": 0.0001, "loss": 1.4109, "loss/crossentropy": 2.2851481437683105, "loss/fcd": 1.234375, "loss/idx": 9.5, "loss/logits": 0.17652758210897446, "step": 5151 }, { "epoch": 0.07693054300838441, "grad_norm": 0.2431640625, "grad_norm_var": 0.00044739643732706705, "learning_rate": 0.0001, "loss": 1.5269, "loss/crossentropy": 2.6430004835128784, "loss/fcd": 1.30078125, "loss/idx": 9.5, "loss/logits": 0.22609710693359375, "step": 5152 }, { "epoch": 0.0769454751789994, "grad_norm": 0.2470703125, "grad_norm_var": 0.00044005314509073894, "learning_rate": 0.0001, "loss": 1.544, "loss/crossentropy": 2.741719961166382, "loss/fcd": 1.30078125, "loss/idx": 9.5, "loss/logits": 0.2432422935962677, "step": 5153 }, { "epoch": 0.07696040734961437, "grad_norm": 0.25, "grad_norm_var": 0.0004172643025716146, "learning_rate": 0.0001, "loss": 1.3939, "loss/crossentropy": 2.556643486022949, "loss/fcd": 1.19921875, "loss/idx": 9.5, "loss/logits": 0.19466085731983185, "step": 5154 }, { "epoch": 0.07697533952022936, "grad_norm": 0.26953125, "grad_norm_var": 0.0003745396931966146, "learning_rate": 0.0001, "loss": 1.3548, "loss/crossentropy": 2.6819896697998047, "loss/fcd": 1.171875, "loss/idx": 9.5, "loss/logits": 0.1829548329114914, "step": 5155 }, { "epoch": 0.07699027169084434, "grad_norm": 0.2490234375, "grad_norm_var": 0.00035921335220336916, "learning_rate": 0.0001, "loss": 1.3885, "loss/crossentropy": 2.537166714668274, "loss/fcd": 1.1953125, "loss/idx": 9.5, "loss/logits": 0.19321545213460922, "step": 5156 }, { "epoch": 0.07700520386145931, "grad_norm": 0.2353515625, "grad_norm_var": 0.00019656817118326824, "learning_rate": 0.0001, "loss": 1.4693, "loss/crossentropy": 2.515978455543518, "loss/fcd": 1.234375, "loss/idx": 9.5, "loss/logits": 0.23488977551460266, "step": 5157 }, { "epoch": 0.0770201360320743, "grad_norm": 0.23828125, "grad_norm_var": 0.00018526713053385418, "learning_rate": 0.0001, "loss": 1.2521, "loss/crossentropy": 2.7309921979904175, "loss/fcd": 1.09765625, "loss/idx": 9.5, "loss/logits": 0.15446199476718903, "step": 5158 }, { "epoch": 0.07703506820268928, "grad_norm": 0.255859375, "grad_norm_var": 0.0001640955607096354, "learning_rate": 0.0001, "loss": 1.3009, "loss/crossentropy": 2.863895297050476, "loss/fcd": 1.125, "loss/idx": 9.5, "loss/logits": 0.17585109919309616, "step": 5159 }, { "epoch": 0.07705000037330427, "grad_norm": 0.2734375, "grad_norm_var": 0.0001782059669494629, "learning_rate": 0.0001, "loss": 1.3574, "loss/crossentropy": 2.629032850265503, "loss/fcd": 1.19140625, "loss/idx": 9.5, "loss/logits": 0.16598480939865112, "step": 5160 }, { "epoch": 0.07706493254391925, "grad_norm": 0.23828125, "grad_norm_var": 0.00018173456192016602, "learning_rate": 0.0001, "loss": 1.471, "loss/crossentropy": 2.6498223543167114, "loss/fcd": 1.25, "loss/idx": 9.5, "loss/logits": 0.221027709543705, "step": 5161 }, { "epoch": 0.07707986471453422, "grad_norm": 0.337890625, "grad_norm_var": 0.0006801883379618326, "learning_rate": 0.0001, "loss": 1.5425, "loss/crossentropy": 2.3581910133361816, "loss/fcd": 1.31640625, "loss/idx": 9.5, "loss/logits": 0.22606557607650757, "step": 5162 }, { "epoch": 0.07709479688514921, "grad_norm": 0.2294921875, "grad_norm_var": 0.0007030487060546875, "learning_rate": 0.0001, "loss": 1.272, "loss/crossentropy": 2.668095827102661, "loss/fcd": 1.103515625, "loss/idx": 9.5, "loss/logits": 0.16845349967479706, "step": 5163 }, { "epoch": 0.07710972905576419, "grad_norm": 0.267578125, "grad_norm_var": 0.0007050832112630209, "learning_rate": 0.0001, "loss": 1.4692, "loss/crossentropy": 2.430495262145996, "loss/fcd": 1.2734375, "loss/idx": 9.5, "loss/logits": 0.19576995819807053, "step": 5164 }, { "epoch": 0.07712466122637918, "grad_norm": 0.251953125, "grad_norm_var": 0.000679488976796468, "learning_rate": 0.0001, "loss": 1.4016, "loss/crossentropy": 2.495182156562805, "loss/fcd": 1.21484375, "loss/idx": 9.5, "loss/logits": 0.18673175573349, "step": 5165 }, { "epoch": 0.07713959339699415, "grad_norm": 0.265625, "grad_norm_var": 0.0006656606992085774, "learning_rate": 0.0001, "loss": 1.4458, "loss/crossentropy": 2.6310843229293823, "loss/fcd": 1.2421875, "loss/idx": 9.5, "loss/logits": 0.2036619633436203, "step": 5166 }, { "epoch": 0.07715452556760914, "grad_norm": 0.24609375, "grad_norm_var": 0.000641342004140218, "learning_rate": 0.0001, "loss": 1.4683, "loss/crossentropy": 2.7076140642166138, "loss/fcd": 1.25, "loss/idx": 9.5, "loss/logits": 0.218313567340374, "step": 5167 }, { "epoch": 0.07716945773822412, "grad_norm": 0.251953125, "grad_norm_var": 0.0006309350331624349, "learning_rate": 0.0001, "loss": 1.4486, "loss/crossentropy": 2.6705222129821777, "loss/fcd": 1.24609375, "loss/idx": 9.5, "loss/logits": 0.2025127187371254, "step": 5168 }, { "epoch": 0.0771843899088391, "grad_norm": 0.263671875, "grad_norm_var": 0.0006268143653869629, "learning_rate": 0.0001, "loss": 1.4692, "loss/crossentropy": 2.3564603328704834, "loss/fcd": 1.2578125, "loss/idx": 9.5, "loss/logits": 0.21141420304775238, "step": 5169 }, { "epoch": 0.07719932207945408, "grad_norm": 0.2490234375, "grad_norm_var": 0.0006278832753499349, "learning_rate": 0.0001, "loss": 1.5642, "loss/crossentropy": 2.420736074447632, "loss/fcd": 1.33203125, "loss/idx": 9.5, "loss/logits": 0.232121042907238, "step": 5170 }, { "epoch": 0.07721425425006906, "grad_norm": 0.25, "grad_norm_var": 0.000620889663696289, "learning_rate": 0.0001, "loss": 1.324, "loss/crossentropy": 2.590607523918152, "loss/fcd": 1.16015625, "loss/idx": 9.5, "loss/logits": 0.16385982930660248, "step": 5171 }, { "epoch": 0.07722918642068405, "grad_norm": 0.3125, "grad_norm_var": 0.0008096973101298015, "learning_rate": 0.0001, "loss": 1.4285, "loss/crossentropy": 2.7149993181228638, "loss/fcd": 1.22265625, "loss/idx": 9.5, "loss/logits": 0.20584313571453094, "step": 5172 }, { "epoch": 0.07724411859129902, "grad_norm": 0.2060546875, "grad_norm_var": 0.0009613315264383952, "learning_rate": 0.0001, "loss": 1.2956, "loss/crossentropy": 2.413415789604187, "loss/fcd": 1.125, "loss/idx": 9.5, "loss/logits": 0.1706022098660469, "step": 5173 }, { "epoch": 0.077259050761914, "grad_norm": 0.326171875, "grad_norm_var": 0.0012059489885965983, "learning_rate": 0.0001, "loss": 1.4505, "loss/crossentropy": 2.6211713552474976, "loss/fcd": 1.2578125, "loss/idx": 9.5, "loss/logits": 0.19270552694797516, "step": 5174 }, { "epoch": 0.07727398293252899, "grad_norm": 0.23046875, "grad_norm_var": 0.0012741367022196452, "learning_rate": 0.0001, "loss": 1.3767, "loss/crossentropy": 2.7654436826705933, "loss/fcd": 1.1875, "loss/idx": 9.5, "loss/logits": 0.1891797035932541, "step": 5175 }, { "epoch": 0.07728891510314397, "grad_norm": 0.255859375, "grad_norm_var": 0.0012678424517313638, "learning_rate": 0.0001, "loss": 1.6714, "loss/crossentropy": 2.6289682388305664, "loss/fcd": 1.36328125, "loss/idx": 9.5, "loss/logits": 0.308069184422493, "step": 5176 }, { "epoch": 0.07730384727375895, "grad_norm": 0.220703125, "grad_norm_var": 0.0013413707415262858, "learning_rate": 0.0001, "loss": 1.3732, "loss/crossentropy": 2.5618470907211304, "loss/fcd": 1.18359375, "loss/idx": 9.5, "loss/logits": 0.18962667137384415, "step": 5177 }, { "epoch": 0.07731877944437393, "grad_norm": 0.2490234375, "grad_norm_var": 0.0009157657623291016, "learning_rate": 0.0001, "loss": 1.4111, "loss/crossentropy": 2.6250041723251343, "loss/fcd": 1.2109375, "loss/idx": 9.5, "loss/logits": 0.20015941560268402, "step": 5178 }, { "epoch": 0.0773337116149889, "grad_norm": 0.2314453125, "grad_norm_var": 0.000909423828125, "learning_rate": 0.0001, "loss": 1.3176, "loss/crossentropy": 2.646431803703308, "loss/fcd": 1.13671875, "loss/idx": 9.5, "loss/logits": 0.1808471530675888, "step": 5179 }, { "epoch": 0.0773486437856039, "grad_norm": 0.2451171875, "grad_norm_var": 0.0009029348691304525, "learning_rate": 0.0001, "loss": 1.5534, "loss/crossentropy": 2.735269069671631, "loss/fcd": 1.32421875, "loss/idx": 9.5, "loss/logits": 0.22916863858699799, "step": 5180 }, { "epoch": 0.07736357595621887, "grad_norm": 0.2177734375, "grad_norm_var": 0.0009829044342041016, "learning_rate": 0.0001, "loss": 1.3482, "loss/crossentropy": 2.4865704774856567, "loss/fcd": 1.1640625, "loss/idx": 9.5, "loss/logits": 0.1841607317328453, "step": 5181 }, { "epoch": 0.07737850812683386, "grad_norm": 0.228515625, "grad_norm_var": 0.0009983062744140625, "learning_rate": 0.0001, "loss": 1.4166, "loss/crossentropy": 2.7411105632781982, "loss/fcd": 1.20703125, "loss/idx": 9.5, "loss/logits": 0.20953258872032166, "step": 5182 }, { "epoch": 0.07739344029744884, "grad_norm": 0.212890625, "grad_norm_var": 0.0010801792144775391, "learning_rate": 0.0001, "loss": 1.4152, "loss/crossentropy": 2.581477165222168, "loss/fcd": 1.2109375, "loss/idx": 9.5, "loss/logits": 0.20427899062633514, "step": 5183 }, { "epoch": 0.07740837246806383, "grad_norm": 0.287109375, "grad_norm_var": 0.001180887222290039, "learning_rate": 0.0001, "loss": 1.611, "loss/crossentropy": 2.2612991333007812, "loss/fcd": 1.37890625, "loss/idx": 9.5, "loss/logits": 0.23208874464035034, "step": 5184 }, { "epoch": 0.0774233046386788, "grad_norm": 0.22265625, "grad_norm_var": 0.0012065887451171875, "learning_rate": 0.0001, "loss": 1.4606, "loss/crossentropy": 2.627973794937134, "loss/fcd": 1.23828125, "loss/idx": 9.5, "loss/logits": 0.22227533161640167, "step": 5185 }, { "epoch": 0.07743823680929378, "grad_norm": 0.23046875, "grad_norm_var": 0.0012220660845438638, "learning_rate": 0.0001, "loss": 1.3918, "loss/crossentropy": 2.7652087211608887, "loss/fcd": 1.203125, "loss/idx": 9.5, "loss/logits": 0.18868913501501083, "step": 5186 }, { "epoch": 0.07745316897990877, "grad_norm": 0.26953125, "grad_norm_var": 0.0012578288714090982, "learning_rate": 0.0001, "loss": 1.4342, "loss/crossentropy": 2.5124751329421997, "loss/fcd": 1.234375, "loss/idx": 9.5, "loss/logits": 0.19980201870203018, "step": 5187 }, { "epoch": 0.07746810115052374, "grad_norm": 0.216796875, "grad_norm_var": 0.0009899099667867025, "learning_rate": 0.0001, "loss": 1.3305, "loss/crossentropy": 2.866755247116089, "loss/fcd": 1.1484375, "loss/idx": 9.5, "loss/logits": 0.18206708133220673, "step": 5188 }, { "epoch": 0.07748303332113873, "grad_norm": 0.2392578125, "grad_norm_var": 0.0009056051572163899, "learning_rate": 0.0001, "loss": 1.2308, "loss/crossentropy": 2.764745831489563, "loss/fcd": 1.0859375, "loss/idx": 9.5, "loss/logits": 0.14481684565544128, "step": 5189 }, { "epoch": 0.07749796549175371, "grad_norm": 0.267578125, "grad_norm_var": 0.0004683454831441244, "learning_rate": 0.0001, "loss": 1.5219, "loss/crossentropy": 2.5990753173828125, "loss/fcd": 1.3125, "loss/idx": 9.5, "loss/logits": 0.20942038297653198, "step": 5190 }, { "epoch": 0.07751289766236868, "grad_norm": 0.2197265625, "grad_norm_var": 0.0004878838857014974, "learning_rate": 0.0001, "loss": 1.3673, "loss/crossentropy": 2.8457467555999756, "loss/fcd": 1.17578125, "loss/idx": 9.5, "loss/logits": 0.19147010892629623, "step": 5191 }, { "epoch": 0.07752782983298367, "grad_norm": 0.232421875, "grad_norm_var": 0.0004676659901936849, "learning_rate": 0.0001, "loss": 1.3119, "loss/crossentropy": 2.5665873289108276, "loss/fcd": 1.14453125, "loss/idx": 9.5, "loss/logits": 0.1674044206738472, "step": 5192 }, { "epoch": 0.07754276200359865, "grad_norm": 0.2314453125, "grad_norm_var": 0.00045162439346313477, "learning_rate": 0.0001, "loss": 1.4487, "loss/crossentropy": 2.5039111375808716, "loss/fcd": 1.2421875, "loss/idx": 9.5, "loss/logits": 0.206502765417099, "step": 5193 }, { "epoch": 0.07755769417421364, "grad_norm": 0.251953125, "grad_norm_var": 0.0004566192626953125, "learning_rate": 0.0001, "loss": 1.5298, "loss/crossentropy": 2.348996162414551, "loss/fcd": 1.3125, "loss/idx": 9.5, "loss/logits": 0.2173483446240425, "step": 5194 }, { "epoch": 0.07757262634482862, "grad_norm": 0.232421875, "grad_norm_var": 0.0004558523495992025, "learning_rate": 0.0001, "loss": 1.353, "loss/crossentropy": 2.7123825550079346, "loss/fcd": 1.16015625, "loss/idx": 9.5, "loss/logits": 0.19279754161834717, "step": 5195 }, { "epoch": 0.07758755851544359, "grad_norm": 0.263671875, "grad_norm_var": 0.0004953384399414062, "learning_rate": 0.0001, "loss": 1.5361, "loss/crossentropy": 2.5569597482681274, "loss/fcd": 1.296875, "loss/idx": 9.5, "loss/logits": 0.23926109075546265, "step": 5196 }, { "epoch": 0.07760249068605858, "grad_norm": 0.228515625, "grad_norm_var": 0.00047212839126586914, "learning_rate": 0.0001, "loss": 1.3257, "loss/crossentropy": 2.7651865482330322, "loss/fcd": 1.1484375, "loss/idx": 9.5, "loss/logits": 0.1772148683667183, "step": 5197 }, { "epoch": 0.07761742285667356, "grad_norm": 0.2275390625, "grad_norm_var": 0.00047364234924316404, "learning_rate": 0.0001, "loss": 1.3445, "loss/crossentropy": 2.5555135011672974, "loss/fcd": 1.16015625, "loss/idx": 9.5, "loss/logits": 0.1843380481004715, "step": 5198 }, { "epoch": 0.07763235502728855, "grad_norm": 0.2080078125, "grad_norm_var": 0.0004925370216369629, "learning_rate": 0.0001, "loss": 1.3825, "loss/crossentropy": 2.4643518924713135, "loss/fcd": 1.1953125, "loss/idx": 9.5, "loss/logits": 0.18719825893640518, "step": 5199 }, { "epoch": 0.07764728719790352, "grad_norm": 0.248046875, "grad_norm_var": 0.0003389954566955566, "learning_rate": 0.0001, "loss": 1.4551, "loss/crossentropy": 2.3251813650131226, "loss/fcd": 1.26171875, "loss/idx": 9.5, "loss/logits": 0.19335834681987762, "step": 5200 }, { "epoch": 0.0776622193685185, "grad_norm": 0.26171875, "grad_norm_var": 0.00036029418309529624, "learning_rate": 0.0001, "loss": 1.4467, "loss/crossentropy": 2.7052289247512817, "loss/fcd": 1.234375, "loss/idx": 9.5, "loss/logits": 0.21229995787143707, "step": 5201 }, { "epoch": 0.07767715153913349, "grad_norm": 0.255859375, "grad_norm_var": 0.00037062565485636393, "learning_rate": 0.0001, "loss": 1.373, "loss/crossentropy": 2.6239166259765625, "loss/fcd": 1.19140625, "loss/idx": 9.5, "loss/logits": 0.1815449297428131, "step": 5202 }, { "epoch": 0.07769208370974846, "grad_norm": 0.236328125, "grad_norm_var": 0.000312801202138265, "learning_rate": 0.0001, "loss": 1.4799, "loss/crossentropy": 2.5234707593917847, "loss/fcd": 1.25, "loss/idx": 9.5, "loss/logits": 0.22987370938062668, "step": 5203 }, { "epoch": 0.07770701588036345, "grad_norm": 0.20703125, "grad_norm_var": 0.00034745136896769204, "learning_rate": 0.0001, "loss": 1.2513, "loss/crossentropy": 2.363078474998474, "loss/fcd": 1.09375, "loss/idx": 9.5, "loss/logits": 0.15753524750471115, "step": 5204 }, { "epoch": 0.07772194805097843, "grad_norm": 0.255859375, "grad_norm_var": 0.000366973876953125, "learning_rate": 0.0001, "loss": 1.4749, "loss/crossentropy": 2.510827422142029, "loss/fcd": 1.2578125, "loss/idx": 9.5, "loss/logits": 0.2171265035867691, "step": 5205 }, { "epoch": 0.07773688022159342, "grad_norm": 0.26953125, "grad_norm_var": 0.0003745873769124349, "learning_rate": 0.0001, "loss": 1.4811, "loss/crossentropy": 2.823817253112793, "loss/fcd": 1.27734375, "loss/idx": 9.5, "loss/logits": 0.20374196767807007, "step": 5206 }, { "epoch": 0.0777518123922084, "grad_norm": 0.31640625, "grad_norm_var": 0.0007054289182027181, "learning_rate": 0.0001, "loss": 1.5304, "loss/crossentropy": 2.80612051486969, "loss/fcd": 1.32421875, "loss/idx": 9.5, "loss/logits": 0.20615637302398682, "step": 5207 }, { "epoch": 0.07776674456282337, "grad_norm": 0.22265625, "grad_norm_var": 0.0007283171017964681, "learning_rate": 0.0001, "loss": 1.3709, "loss/crossentropy": 2.4733203649520874, "loss/fcd": 1.17578125, "loss/idx": 9.5, "loss/logits": 0.19514798372983932, "step": 5208 }, { "epoch": 0.07778167673343836, "grad_norm": 0.25, "grad_norm_var": 0.0007167657216389974, "learning_rate": 0.0001, "loss": 1.4546, "loss/crossentropy": 2.7578741312026978, "loss/fcd": 1.25390625, "loss/idx": 9.5, "loss/logits": 0.2007293999195099, "step": 5209 }, { "epoch": 0.07779660890405334, "grad_norm": 0.2373046875, "grad_norm_var": 0.0007184942563374838, "learning_rate": 0.0001, "loss": 1.3464, "loss/crossentropy": 2.736752152442932, "loss/fcd": 1.18359375, "loss/idx": 9.5, "loss/logits": 0.16280636191368103, "step": 5210 }, { "epoch": 0.07781154107466833, "grad_norm": 0.2333984375, "grad_norm_var": 0.0007169087727864583, "learning_rate": 0.0001, "loss": 1.2307, "loss/crossentropy": 2.4676185846328735, "loss/fcd": 1.08203125, "loss/idx": 9.5, "loss/logits": 0.14866498112678528, "step": 5211 }, { "epoch": 0.0778264732452833, "grad_norm": 0.25390625, "grad_norm_var": 0.000698709487915039, "learning_rate": 0.0001, "loss": 1.4234, "loss/crossentropy": 2.53171968460083, "loss/fcd": 1.2265625, "loss/idx": 9.5, "loss/logits": 0.19687501341104507, "step": 5212 }, { "epoch": 0.07784140541589828, "grad_norm": 0.2265625, "grad_norm_var": 0.0007031122843424479, "learning_rate": 0.0001, "loss": 1.3457, "loss/crossentropy": 2.545814871788025, "loss/fcd": 1.171875, "loss/idx": 9.5, "loss/logits": 0.17383848130702972, "step": 5213 }, { "epoch": 0.07785633758651327, "grad_norm": 0.298828125, "grad_norm_var": 0.0008606235186258952, "learning_rate": 0.0001, "loss": 1.4753, "loss/crossentropy": 2.4634199142456055, "loss/fcd": 1.28125, "loss/idx": 9.5, "loss/logits": 0.19401024281978607, "step": 5214 }, { "epoch": 0.07787126975712824, "grad_norm": 0.228515625, "grad_norm_var": 0.0007752577463785807, "learning_rate": 0.0001, "loss": 1.3273, "loss/crossentropy": 2.7041878700256348, "loss/fcd": 1.15234375, "loss/idx": 9.5, "loss/logits": 0.17497096955776215, "step": 5215 }, { "epoch": 0.07788620192774323, "grad_norm": 0.212890625, "grad_norm_var": 0.0008622328440348307, "learning_rate": 0.0001, "loss": 1.2691, "loss/crossentropy": 2.71786892414093, "loss/fcd": 1.1015625, "loss/idx": 9.5, "loss/logits": 0.16754397749900818, "step": 5216 }, { "epoch": 0.07790113409835821, "grad_norm": 0.25390625, "grad_norm_var": 0.0008516788482666015, "learning_rate": 0.0001, "loss": 1.3692, "loss/crossentropy": 2.536847472190857, "loss/fcd": 1.19140625, "loss/idx": 9.5, "loss/logits": 0.17775063961744308, "step": 5217 }, { "epoch": 0.07791606626897318, "grad_norm": 0.283203125, "grad_norm_var": 0.0009291172027587891, "learning_rate": 0.0001, "loss": 1.4775, "loss/crossentropy": 2.441604971885681, "loss/fcd": 1.28125, "loss/idx": 9.5, "loss/logits": 0.1962539628148079, "step": 5218 }, { "epoch": 0.07793099843958817, "grad_norm": 0.296875, "grad_norm_var": 0.0010547637939453125, "learning_rate": 0.0001, "loss": 1.5947, "loss/crossentropy": 2.68950092792511, "loss/fcd": 1.36328125, "loss/idx": 9.5, "loss/logits": 0.23139263689517975, "step": 5219 }, { "epoch": 0.07794593061020315, "grad_norm": 0.216796875, "grad_norm_var": 0.0010009606679280598, "learning_rate": 0.0001, "loss": 1.3649, "loss/crossentropy": 2.6801990270614624, "loss/fcd": 1.17578125, "loss/idx": 9.5, "loss/logits": 0.1890772208571434, "step": 5220 }, { "epoch": 0.07796086278081814, "grad_norm": 0.2138671875, "grad_norm_var": 0.001098183790842692, "learning_rate": 0.0001, "loss": 1.2665, "loss/crossentropy": 2.4718207120895386, "loss/fcd": 1.109375, "loss/idx": 9.5, "loss/logits": 0.15716806054115295, "step": 5221 }, { "epoch": 0.07797579495143311, "grad_norm": 0.47265625, "grad_norm_var": 0.004181094964345296, "learning_rate": 0.0001, "loss": 1.7727, "loss/crossentropy": 2.3185617923736572, "loss/fcd": 1.48828125, "loss/idx": 9.5, "loss/logits": 0.2844642475247383, "step": 5222 }, { "epoch": 0.07799072712204809, "grad_norm": 0.2294921875, "grad_norm_var": 0.004041401545206705, "learning_rate": 0.0001, "loss": 1.3634, "loss/crossentropy": 2.643775701522827, "loss/fcd": 1.18359375, "loss/idx": 9.5, "loss/logits": 0.1797671839594841, "step": 5223 }, { "epoch": 0.07800565929266308, "grad_norm": 0.265625, "grad_norm_var": 0.003953282038370768, "learning_rate": 0.0001, "loss": 1.4269, "loss/crossentropy": 2.5379889011383057, "loss/fcd": 1.21875, "loss/idx": 9.5, "loss/logits": 0.2081250548362732, "step": 5224 }, { "epoch": 0.07802059146327806, "grad_norm": 0.306640625, "grad_norm_var": 0.004071744283040365, "learning_rate": 0.0001, "loss": 1.5963, "loss/crossentropy": 2.977737069129944, "loss/fcd": 1.3515625, "loss/idx": 9.5, "loss/logits": 0.24469277262687683, "step": 5225 }, { "epoch": 0.07803552363389304, "grad_norm": 0.2294921875, "grad_norm_var": 0.0041037877400716145, "learning_rate": 0.0001, "loss": 1.2866, "loss/crossentropy": 2.3959845304489136, "loss/fcd": 1.12890625, "loss/idx": 9.5, "loss/logits": 0.15768348425626755, "step": 5226 }, { "epoch": 0.07805045580450802, "grad_norm": 0.341796875, "grad_norm_var": 0.004397102197011312, "learning_rate": 0.0001, "loss": 1.901, "loss/crossentropy": 2.915044665336609, "loss/fcd": 1.546875, "loss/idx": 9.5, "loss/logits": 0.3541547432541847, "step": 5227 }, { "epoch": 0.07806538797512301, "grad_norm": 0.29296875, "grad_norm_var": 0.0044050494829813635, "learning_rate": 0.0001, "loss": 1.6395, "loss/crossentropy": 2.4101873636245728, "loss/fcd": 1.3984375, "loss/idx": 9.5, "loss/logits": 0.2410811260342598, "step": 5228 }, { "epoch": 0.07808032014573799, "grad_norm": 0.271484375, "grad_norm_var": 0.004252239068349203, "learning_rate": 0.0001, "loss": 1.5875, "loss/crossentropy": 2.5188355445861816, "loss/fcd": 1.34765625, "loss/idx": 9.5, "loss/logits": 0.2398509979248047, "step": 5229 }, { "epoch": 0.07809525231635296, "grad_norm": 0.2197265625, "grad_norm_var": 0.0044019063313802086, "learning_rate": 0.0001, "loss": 1.3971, "loss/crossentropy": 2.51921284198761, "loss/fcd": 1.203125, "loss/idx": 9.5, "loss/logits": 0.1939656287431717, "step": 5230 }, { "epoch": 0.07811018448696795, "grad_norm": 0.3125, "grad_norm_var": 0.00436704953511556, "learning_rate": 0.0001, "loss": 1.4555, "loss/crossentropy": 2.53823721408844, "loss/fcd": 1.2421875, "loss/idx": 9.5, "loss/logits": 0.213275745511055, "step": 5231 }, { "epoch": 0.07812511665758293, "grad_norm": 0.2421875, "grad_norm_var": 0.00417321523030599, "learning_rate": 0.0001, "loss": 1.37, "loss/crossentropy": 2.4659221172332764, "loss/fcd": 1.1875, "loss/idx": 9.5, "loss/logits": 0.18246667832136154, "step": 5232 }, { "epoch": 0.07814004882819792, "grad_norm": 0.255859375, "grad_norm_var": 0.00416715939839681, "learning_rate": 0.0001, "loss": 1.412, "loss/crossentropy": 2.7446285486221313, "loss/fcd": 1.22265625, "loss/idx": 9.5, "loss/logits": 0.1893739551305771, "step": 5233 }, { "epoch": 0.07815498099881289, "grad_norm": 0.2373046875, "grad_norm_var": 0.004268197218577067, "learning_rate": 0.0001, "loss": 1.2984, "loss/crossentropy": 2.707065224647522, "loss/fcd": 1.12890625, "loss/idx": 9.5, "loss/logits": 0.1695401966571808, "step": 5234 }, { "epoch": 0.07816991316942787, "grad_norm": 0.2255859375, "grad_norm_var": 0.004381036758422852, "learning_rate": 0.0001, "loss": 1.454, "loss/crossentropy": 2.739125967025757, "loss/fcd": 1.25, "loss/idx": 9.5, "loss/logits": 0.20399294793605804, "step": 5235 }, { "epoch": 0.07818484534004286, "grad_norm": 0.2451171875, "grad_norm_var": 0.004226966698964437, "learning_rate": 0.0001, "loss": 1.4995, "loss/crossentropy": 2.512742757797241, "loss/fcd": 1.27734375, "loss/idx": 9.5, "loss/logits": 0.2221674919128418, "step": 5236 }, { "epoch": 0.07819977751065783, "grad_norm": 0.287109375, "grad_norm_var": 0.003988250096638998, "learning_rate": 0.0001, "loss": 1.4836, "loss/crossentropy": 2.642538905143738, "loss/fcd": 1.2734375, "loss/idx": 9.5, "loss/logits": 0.2101661041378975, "step": 5237 }, { "epoch": 0.07821470968127282, "grad_norm": 0.2421875, "grad_norm_var": 0.0013024489084879558, "learning_rate": 0.0001, "loss": 1.3704, "loss/crossentropy": 2.683635115623474, "loss/fcd": 1.1875, "loss/idx": 9.5, "loss/logits": 0.1829146295785904, "step": 5238 }, { "epoch": 0.0782296418518878, "grad_norm": 0.365234375, "grad_norm_var": 0.0018509189287821452, "learning_rate": 0.0001, "loss": 1.5494, "loss/crossentropy": 3.015002131462097, "loss/fcd": 1.328125, "loss/idx": 9.5, "loss/logits": 0.2213006466627121, "step": 5239 }, { "epoch": 0.07824457402250277, "grad_norm": 0.291015625, "grad_norm_var": 0.0018719951311747232, "learning_rate": 0.0001, "loss": 1.5312, "loss/crossentropy": 2.730661392211914, "loss/fcd": 1.3125, "loss/idx": 9.5, "loss/logits": 0.21868795156478882, "step": 5240 }, { "epoch": 0.07825950619311776, "grad_norm": 0.2294921875, "grad_norm_var": 0.0018967946370442709, "learning_rate": 0.0001, "loss": 1.2915, "loss/crossentropy": 2.570380210876465, "loss/fcd": 1.125, "loss/idx": 9.5, "loss/logits": 0.16650037467479706, "step": 5241 }, { "epoch": 0.07827443836373274, "grad_norm": 0.29296875, "grad_norm_var": 0.0018221497535705566, "learning_rate": 0.0001, "loss": 1.4578, "loss/crossentropy": 2.62088406085968, "loss/fcd": 1.2578125, "loss/idx": 9.5, "loss/logits": 0.19995557516813278, "step": 5242 }, { "epoch": 0.07828937053434773, "grad_norm": 0.2109375, "grad_norm_var": 0.0016751885414123535, "learning_rate": 0.0001, "loss": 1.195, "loss/crossentropy": 2.735041379928589, "loss/fcd": 1.05078125, "loss/idx": 9.5, "loss/logits": 0.14418356865644455, "step": 5243 }, { "epoch": 0.0783043027049627, "grad_norm": 0.23046875, "grad_norm_var": 0.0016767144203186034, "learning_rate": 0.0001, "loss": 1.3438, "loss/crossentropy": 2.945288300514221, "loss/fcd": 1.1640625, "loss/idx": 9.5, "loss/logits": 0.17977716028690338, "step": 5244 }, { "epoch": 0.0783192348755777, "grad_norm": 0.2890625, "grad_norm_var": 0.0017230629920959473, "learning_rate": 0.0001, "loss": 1.5341, "loss/crossentropy": 2.5576388835906982, "loss/fcd": 1.31640625, "loss/idx": 9.5, "loss/logits": 0.21770796179771423, "step": 5245 }, { "epoch": 0.07833416704619267, "grad_norm": 0.251953125, "grad_norm_var": 0.0016104221343994141, "learning_rate": 0.0001, "loss": 1.42, "loss/crossentropy": 2.558943033218384, "loss/fcd": 1.2265625, "loss/idx": 9.5, "loss/logits": 0.19347935169935226, "step": 5246 }, { "epoch": 0.07834909921680765, "grad_norm": 0.23828125, "grad_norm_var": 0.0014654636383056641, "learning_rate": 0.0001, "loss": 1.4785, "loss/crossentropy": 2.6159772872924805, "loss/fcd": 1.25390625, "loss/idx": 9.5, "loss/logits": 0.22460514307022095, "step": 5247 }, { "epoch": 0.07836403138742264, "grad_norm": 0.2177734375, "grad_norm_var": 0.0015555659929911295, "learning_rate": 0.0001, "loss": 1.4793, "loss/crossentropy": 2.5065218210220337, "loss/fcd": 1.24609375, "loss/idx": 9.5, "loss/logits": 0.23319456726312637, "step": 5248 }, { "epoch": 0.07837896355803761, "grad_norm": 0.22265625, "grad_norm_var": 0.0016290624936421713, "learning_rate": 0.0001, "loss": 1.3594, "loss/crossentropy": 2.511572480201721, "loss/fcd": 1.171875, "loss/idx": 9.5, "loss/logits": 0.18748769164085388, "step": 5249 }, { "epoch": 0.0783938957286526, "grad_norm": 0.294921875, "grad_norm_var": 0.0017019748687744141, "learning_rate": 0.0001, "loss": 1.8983, "loss/crossentropy": 2.418375849723816, "loss/fcd": 1.609375, "loss/idx": 9.5, "loss/logits": 0.2889554649591446, "step": 5250 }, { "epoch": 0.07840882789926758, "grad_norm": 0.29296875, "grad_norm_var": 0.0016907334327697753, "learning_rate": 0.0001, "loss": 1.4511, "loss/crossentropy": 2.5771608352661133, "loss/fcd": 1.25390625, "loss/idx": 9.5, "loss/logits": 0.1971866339445114, "step": 5251 }, { "epoch": 0.07842376006988255, "grad_norm": 0.271484375, "grad_norm_var": 0.0016726016998291015, "learning_rate": 0.0001, "loss": 1.5396, "loss/crossentropy": 2.5372835397720337, "loss/fcd": 1.3203125, "loss/idx": 9.5, "loss/logits": 0.21924323588609695, "step": 5252 }, { "epoch": 0.07843869224049754, "grad_norm": 0.419921875, "grad_norm_var": 0.0031792799631754558, "learning_rate": 0.0001, "loss": 1.7395, "loss/crossentropy": 2.466989576816559, "loss/fcd": 1.5390625, "loss/idx": 9.5, "loss/logits": 0.20039913058280945, "step": 5253 }, { "epoch": 0.07845362441111252, "grad_norm": 0.2421875, "grad_norm_var": 0.0031792799631754558, "learning_rate": 0.0001, "loss": 1.4148, "loss/crossentropy": 2.541210174560547, "loss/fcd": 1.22265625, "loss/idx": 9.5, "loss/logits": 0.19218719005584717, "step": 5254 }, { "epoch": 0.07846855658172751, "grad_norm": 0.2578125, "grad_norm_var": 0.0025734583536783854, "learning_rate": 0.0001, "loss": 1.3369, "loss/crossentropy": 2.5238672494888306, "loss/fcd": 1.1640625, "loss/idx": 9.5, "loss/logits": 0.17279626429080963, "step": 5255 }, { "epoch": 0.07848348875234248, "grad_norm": 0.255859375, "grad_norm_var": 0.0025328318277994793, "learning_rate": 0.0001, "loss": 1.3651, "loss/crossentropy": 2.636286973953247, "loss/fcd": 1.18359375, "loss/idx": 9.5, "loss/logits": 0.18150195479393005, "step": 5256 }, { "epoch": 0.07849842092295746, "grad_norm": 0.248046875, "grad_norm_var": 0.002469789981842041, "learning_rate": 0.0001, "loss": 1.3581, "loss/crossentropy": 2.572114586830139, "loss/fcd": 1.1875, "loss/idx": 9.5, "loss/logits": 0.170571967959404, "step": 5257 }, { "epoch": 0.07851335309357245, "grad_norm": 0.265625, "grad_norm_var": 0.002413936456044515, "learning_rate": 0.0001, "loss": 1.3477, "loss/crossentropy": 2.4368863105773926, "loss/fcd": 1.1796875, "loss/idx": 9.5, "loss/logits": 0.16798392683267593, "step": 5258 }, { "epoch": 0.07852828526418743, "grad_norm": 0.2412109375, "grad_norm_var": 0.0022605737050374348, "learning_rate": 0.0001, "loss": 1.3825, "loss/crossentropy": 2.640411138534546, "loss/fcd": 1.19921875, "loss/idx": 9.5, "loss/logits": 0.18331274390220642, "step": 5259 }, { "epoch": 0.07854321743480241, "grad_norm": 0.216796875, "grad_norm_var": 0.002335230509440104, "learning_rate": 0.0001, "loss": 1.4789, "loss/crossentropy": 2.4445735216140747, "loss/fcd": 1.2578125, "loss/idx": 9.5, "loss/logits": 0.22110744565725327, "step": 5260 }, { "epoch": 0.07855814960541739, "grad_norm": 0.28515625, "grad_norm_var": 0.0023232142130533854, "learning_rate": 0.0001, "loss": 1.3824, "loss/crossentropy": 2.319474220275879, "loss/fcd": 1.22265625, "loss/idx": 9.5, "loss/logits": 0.15971645712852478, "step": 5261 }, { "epoch": 0.07857308177603237, "grad_norm": 0.2265625, "grad_norm_var": 0.002404006322224935, "learning_rate": 0.0001, "loss": 1.3336, "loss/crossentropy": 2.784552574157715, "loss/fcd": 1.16015625, "loss/idx": 9.5, "loss/logits": 0.1734277456998825, "step": 5262 }, { "epoch": 0.07858801394664736, "grad_norm": 0.2001953125, "grad_norm_var": 0.0026167829831441245, "learning_rate": 0.0001, "loss": 1.3187, "loss/crossentropy": 2.588140845298767, "loss/fcd": 1.14453125, "loss/idx": 9.5, "loss/logits": 0.17419981956481934, "step": 5263 }, { "epoch": 0.07860294611726233, "grad_norm": 0.283203125, "grad_norm_var": 0.0025164127349853516, "learning_rate": 0.0001, "loss": 1.4653, "loss/crossentropy": 2.462034225463867, "loss/fcd": 1.27734375, "loss/idx": 9.5, "loss/logits": 0.18792006373405457, "step": 5264 }, { "epoch": 0.07861787828787732, "grad_norm": 0.2158203125, "grad_norm_var": 0.0025570511817932127, "learning_rate": 0.0001, "loss": 1.3061, "loss/crossentropy": 2.5613977909088135, "loss/fcd": 1.13671875, "loss/idx": 9.5, "loss/logits": 0.16935274749994278, "step": 5265 }, { "epoch": 0.0786328104584923, "grad_norm": 0.271484375, "grad_norm_var": 0.0024935364723205566, "learning_rate": 0.0001, "loss": 1.5149, "loss/crossentropy": 2.4735918045043945, "loss/fcd": 1.29296875, "loss/idx": 9.5, "loss/logits": 0.22190575301647186, "step": 5266 }, { "epoch": 0.07864774262910729, "grad_norm": 0.28515625, "grad_norm_var": 0.0024652441342671713, "learning_rate": 0.0001, "loss": 1.3894, "loss/crossentropy": 2.398812770843506, "loss/fcd": 1.1953125, "loss/idx": 9.5, "loss/logits": 0.19413380324840546, "step": 5267 }, { "epoch": 0.07866267479972226, "grad_norm": 0.23828125, "grad_norm_var": 0.0024906436602274576, "learning_rate": 0.0001, "loss": 1.3934, "loss/crossentropy": 2.53994357585907, "loss/fcd": 1.20703125, "loss/idx": 9.5, "loss/logits": 0.18633929640054703, "step": 5268 }, { "epoch": 0.07867760697033724, "grad_norm": 0.2490234375, "grad_norm_var": 0.0006624698638916015, "learning_rate": 0.0001, "loss": 1.4443, "loss/crossentropy": 2.5857186317443848, "loss/fcd": 1.23828125, "loss/idx": 9.5, "loss/logits": 0.20603742450475693, "step": 5269 }, { "epoch": 0.07869253914095223, "grad_norm": 0.2314453125, "grad_norm_var": 0.0006792982419331869, "learning_rate": 0.0001, "loss": 1.3491, "loss/crossentropy": 2.6984835863113403, "loss/fcd": 1.16796875, "loss/idx": 9.5, "loss/logits": 0.1811518743634224, "step": 5270 }, { "epoch": 0.0787074713115672, "grad_norm": 0.255859375, "grad_norm_var": 0.0006770412127176921, "learning_rate": 0.0001, "loss": 1.3886, "loss/crossentropy": 2.3890680074691772, "loss/fcd": 1.20703125, "loss/idx": 9.5, "loss/logits": 0.18155304342508316, "step": 5271 }, { "epoch": 0.0787224034821822, "grad_norm": 0.26953125, "grad_norm_var": 0.0007028539975484212, "learning_rate": 0.0001, "loss": 1.328, "loss/crossentropy": 2.546673893928528, "loss/fcd": 1.1484375, "loss/idx": 9.5, "loss/logits": 0.1795889437198639, "step": 5272 }, { "epoch": 0.07873733565279717, "grad_norm": 0.27734375, "grad_norm_var": 0.0007529218991597493, "learning_rate": 0.0001, "loss": 1.4556, "loss/crossentropy": 2.6166927814483643, "loss/fcd": 1.265625, "loss/idx": 9.5, "loss/logits": 0.18999746441841125, "step": 5273 }, { "epoch": 0.07875226782341214, "grad_norm": 0.2431640625, "grad_norm_var": 0.0007400353749593099, "learning_rate": 0.0001, "loss": 1.4386, "loss/crossentropy": 2.303736448287964, "loss/fcd": 1.2421875, "loss/idx": 9.5, "loss/logits": 0.19640373438596725, "step": 5274 }, { "epoch": 0.07876719999402713, "grad_norm": 0.2451171875, "grad_norm_var": 0.0007367293039957682, "learning_rate": 0.0001, "loss": 1.3331, "loss/crossentropy": 2.5346884727478027, "loss/fcd": 1.1484375, "loss/idx": 9.5, "loss/logits": 0.18468940258026123, "step": 5275 }, { "epoch": 0.07878213216464211, "grad_norm": 0.2197265625, "grad_norm_var": 0.0007244388262430827, "learning_rate": 0.0001, "loss": 1.3391, "loss/crossentropy": 2.6236789226531982, "loss/fcd": 1.15625, "loss/idx": 9.5, "loss/logits": 0.18283085525035858, "step": 5276 }, { "epoch": 0.0787970643352571, "grad_norm": 0.234375, "grad_norm_var": 0.0006463328997294108, "learning_rate": 0.0001, "loss": 1.2723, "loss/crossentropy": 2.36770236492157, "loss/fcd": 1.1171875, "loss/idx": 9.5, "loss/logits": 0.1550832986831665, "step": 5277 }, { "epoch": 0.07881199650587208, "grad_norm": 0.2431640625, "grad_norm_var": 0.0006191094716389973, "learning_rate": 0.0001, "loss": 1.2433, "loss/crossentropy": 2.546992301940918, "loss/fcd": 1.08984375, "loss/idx": 9.5, "loss/logits": 0.1534321829676628, "step": 5278 }, { "epoch": 0.07882692867648705, "grad_norm": 0.23046875, "grad_norm_var": 0.00048471689224243163, "learning_rate": 0.0001, "loss": 1.3459, "loss/crossentropy": 2.6796528100967407, "loss/fcd": 1.171875, "loss/idx": 9.5, "loss/logits": 0.17398051172494888, "step": 5279 }, { "epoch": 0.07884186084710204, "grad_norm": 0.251953125, "grad_norm_var": 0.0004056255022684733, "learning_rate": 0.0001, "loss": 1.4046, "loss/crossentropy": 2.689160466194153, "loss/fcd": 1.19921875, "loss/idx": 9.5, "loss/logits": 0.2054290845990181, "step": 5280 }, { "epoch": 0.07885679301771702, "grad_norm": 0.37109375, "grad_norm_var": 0.0012541453043619791, "learning_rate": 0.0001, "loss": 1.7971, "loss/crossentropy": 2.661249876022339, "loss/fcd": 1.52734375, "loss/idx": 9.5, "loss/logits": 0.2697070389986038, "step": 5281 }, { "epoch": 0.078871725188332, "grad_norm": 0.296875, "grad_norm_var": 0.0013423760732014973, "learning_rate": 0.0001, "loss": 1.4869, "loss/crossentropy": 2.3622909784317017, "loss/fcd": 1.28515625, "loss/idx": 9.5, "loss/logits": 0.20175060629844666, "step": 5282 }, { "epoch": 0.07888665735894698, "grad_norm": 0.236328125, "grad_norm_var": 0.0013205210367838542, "learning_rate": 0.0001, "loss": 1.3612, "loss/crossentropy": 2.5023049116134644, "loss/fcd": 1.171875, "loss/idx": 9.5, "loss/logits": 0.1893680989742279, "step": 5283 }, { "epoch": 0.07890158952956196, "grad_norm": 0.275390625, "grad_norm_var": 0.0013196150461832683, "learning_rate": 0.0001, "loss": 1.327, "loss/crossentropy": 2.6777700185775757, "loss/fcd": 1.16015625, "loss/idx": 9.5, "loss/logits": 0.1668350100517273, "step": 5284 }, { "epoch": 0.07891652170017695, "grad_norm": 0.2138671875, "grad_norm_var": 0.0014397780100504558, "learning_rate": 0.0001, "loss": 1.3956, "loss/crossentropy": 2.5684096813201904, "loss/fcd": 1.1953125, "loss/idx": 9.5, "loss/logits": 0.20029211789369583, "step": 5285 }, { "epoch": 0.07893145387079192, "grad_norm": 0.248046875, "grad_norm_var": 0.0014026920000712077, "learning_rate": 0.0001, "loss": 1.4492, "loss/crossentropy": 2.5393121242523193, "loss/fcd": 1.25390625, "loss/idx": 9.5, "loss/logits": 0.1952626258134842, "step": 5286 }, { "epoch": 0.07894638604140691, "grad_norm": 0.248046875, "grad_norm_var": 0.0014077146848042806, "learning_rate": 0.0001, "loss": 1.3011, "loss/crossentropy": 2.245310425758362, "loss/fcd": 1.15234375, "loss/idx": 9.5, "loss/logits": 0.148767352104187, "step": 5287 }, { "epoch": 0.07896131821202189, "grad_norm": 0.2412109375, "grad_norm_var": 0.0014087518056233725, "learning_rate": 0.0001, "loss": 1.3, "loss/crossentropy": 2.549725890159607, "loss/fcd": 1.12890625, "loss/idx": 9.5, "loss/logits": 0.17113995552062988, "step": 5288 }, { "epoch": 0.07897625038263688, "grad_norm": 0.30078125, "grad_norm_var": 0.00151365598042806, "learning_rate": 0.0001, "loss": 1.4588, "loss/crossentropy": 2.4771581888198853, "loss/fcd": 1.265625, "loss/idx": 9.5, "loss/logits": 0.1931420937180519, "step": 5289 }, { "epoch": 0.07899118255325185, "grad_norm": 0.2294921875, "grad_norm_var": 0.0015491485595703126, "learning_rate": 0.0001, "loss": 1.3956, "loss/crossentropy": 2.6338335275650024, "loss/fcd": 1.19140625, "loss/idx": 9.5, "loss/logits": 0.20421184599399567, "step": 5290 }, { "epoch": 0.07900611472386683, "grad_norm": 0.2431640625, "grad_norm_var": 0.0015520572662353516, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.753323197364807, "loss/fcd": 1.22265625, "loss/idx": 9.5, "loss/logits": 0.198401041328907, "step": 5291 }, { "epoch": 0.07902104689448182, "grad_norm": 0.25390625, "grad_norm_var": 0.0014631867408752442, "learning_rate": 0.0001, "loss": 1.3977, "loss/crossentropy": 2.5091352462768555, "loss/fcd": 1.21484375, "loss/idx": 9.5, "loss/logits": 0.1828625202178955, "step": 5292 }, { "epoch": 0.0790359790650968, "grad_norm": 0.259765625, "grad_norm_var": 0.0014255801836649577, "learning_rate": 0.0001, "loss": 1.3536, "loss/crossentropy": 2.6134684085845947, "loss/fcd": 1.171875, "loss/idx": 9.5, "loss/logits": 0.1817653402686119, "step": 5293 }, { "epoch": 0.07905091123571178, "grad_norm": 0.39453125, "grad_norm_var": 0.0025385379791259765, "learning_rate": 0.0001, "loss": 1.4245, "loss/crossentropy": 2.792780876159668, "loss/fcd": 1.2578125, "loss/idx": 9.5, "loss/logits": 0.1667240411043167, "step": 5294 }, { "epoch": 0.07906584340632676, "grad_norm": 0.248046875, "grad_norm_var": 0.0024688720703125, "learning_rate": 0.0001, "loss": 1.3139, "loss/crossentropy": 2.4442254304885864, "loss/fcd": 1.1484375, "loss/idx": 9.5, "loss/logits": 0.16542242467403412, "step": 5295 }, { "epoch": 0.07908077557694174, "grad_norm": 0.287109375, "grad_norm_var": 0.0024637222290039063, "learning_rate": 0.0001, "loss": 1.4337, "loss/crossentropy": 2.933897018432617, "loss/fcd": 1.2265625, "loss/idx": 9.5, "loss/logits": 0.20716208219528198, "step": 5296 }, { "epoch": 0.07909570774755673, "grad_norm": 0.287109375, "grad_norm_var": 0.0017918745676676433, "learning_rate": 0.0001, "loss": 1.3246, "loss/crossentropy": 2.750832676887512, "loss/fcd": 1.1484375, "loss/idx": 9.75, "loss/logits": 0.17616190761327744, "step": 5297 }, { "epoch": 0.0791106399181717, "grad_norm": 0.671875, "grad_norm_var": 0.012100712458292643, "learning_rate": 0.0001, "loss": 1.5802, "loss/crossentropy": 2.71075701713562, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.1973956972360611, "step": 5298 }, { "epoch": 0.07912557208878669, "grad_norm": 0.61328125, "grad_norm_var": 0.018288167317708333, "learning_rate": 0.0001, "loss": 1.7664, "loss/crossentropy": 2.784268021583557, "loss/fcd": 1.5234375, "loss/idx": 10.0, "loss/logits": 0.24300340563058853, "step": 5299 }, { "epoch": 0.07914050425940167, "grad_norm": 0.52734375, "grad_norm_var": 0.02097624142964681, "learning_rate": 0.0001, "loss": 1.7253, "loss/crossentropy": 2.5237553119659424, "loss/fcd": 1.484375, "loss/idx": 10.0, "loss/logits": 0.24088122695684433, "step": 5300 }, { "epoch": 0.07915543643001664, "grad_norm": 0.482421875, "grad_norm_var": 0.021353240807851157, "learning_rate": 0.0001, "loss": 1.6082, "loss/crossentropy": 2.5229151248931885, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.21371044218540192, "step": 5301 }, { "epoch": 0.07917036860063163, "grad_norm": 0.482421875, "grad_norm_var": 0.021725173791249594, "learning_rate": 0.0001, "loss": 1.6686, "loss/crossentropy": 2.524850845336914, "loss/fcd": 1.44921875, "loss/idx": 10.0, "loss/logits": 0.21936559677124023, "step": 5302 }, { "epoch": 0.07918530077124661, "grad_norm": 0.484375, "grad_norm_var": 0.021667476495107016, "learning_rate": 0.0001, "loss": 1.5193, "loss/crossentropy": 2.7809040546417236, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.18730450421571732, "step": 5303 }, { "epoch": 0.0792002329418616, "grad_norm": 0.56640625, "grad_norm_var": 0.022457440694173176, "learning_rate": 0.0001, "loss": 1.6095, "loss/crossentropy": 2.8887401819229126, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.19934014230966568, "step": 5304 }, { "epoch": 0.07921516511247657, "grad_norm": 0.43359375, "grad_norm_var": 0.02187811533610026, "learning_rate": 0.0001, "loss": 1.5008, "loss/crossentropy": 2.71807599067688, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19606418162584305, "step": 5305 }, { "epoch": 0.07923009728309156, "grad_norm": 0.4140625, "grad_norm_var": 0.019711426893870034, "learning_rate": 0.0001, "loss": 1.6507, "loss/crossentropy": 2.574560284614563, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.22491853684186935, "step": 5306 }, { "epoch": 0.07924502945370654, "grad_norm": 0.388671875, "grad_norm_var": 0.01768949826558431, "learning_rate": 0.0001, "loss": 1.468, "loss/crossentropy": 2.7048345804214478, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.18285012245178223, "step": 5307 }, { "epoch": 0.07925996162432151, "grad_norm": 0.384765625, "grad_norm_var": 0.015780067443847655, "learning_rate": 0.0001, "loss": 1.4703, "loss/crossentropy": 2.7373528480529785, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1890011429786682, "step": 5308 }, { "epoch": 0.0792748937949365, "grad_norm": 0.486328125, "grad_norm_var": 0.013759295145670572, "learning_rate": 0.0001, "loss": 1.5855, "loss/crossentropy": 2.5739086866378784, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.19882231205701828, "step": 5309 }, { "epoch": 0.07928982596555148, "grad_norm": 0.400390625, "grad_norm_var": 0.013720432917277018, "learning_rate": 0.0001, "loss": 1.638, "loss/crossentropy": 2.5467922687530518, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.23960884660482407, "step": 5310 }, { "epoch": 0.07930475813616647, "grad_norm": 0.41015625, "grad_norm_var": 0.011054229736328126, "learning_rate": 0.0001, "loss": 1.5747, "loss/crossentropy": 2.60333788394928, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.2035960853099823, "step": 5311 }, { "epoch": 0.07931969030678145, "grad_norm": 0.3828125, "grad_norm_var": 0.009452168146769207, "learning_rate": 0.0001, "loss": 1.5817, "loss/crossentropy": 2.7480242252349854, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.21843475103378296, "step": 5312 }, { "epoch": 0.07933462247739642, "grad_norm": 0.41796875, "grad_norm_var": 0.00744476318359375, "learning_rate": 0.0001, "loss": 1.5958, "loss/crossentropy": 2.579663038253784, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.22082997858524323, "step": 5313 }, { "epoch": 0.07934955464801141, "grad_norm": 0.38671875, "grad_norm_var": 0.004915301005045573, "learning_rate": 0.0001, "loss": 1.4413, "loss/crossentropy": 2.662824869155884, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.1795828491449356, "step": 5314 }, { "epoch": 0.07936448681862639, "grad_norm": 0.439453125, "grad_norm_var": 0.003108835220336914, "learning_rate": 0.0001, "loss": 1.7391, "loss/crossentropy": 2.60833477973938, "loss/fcd": 1.5, "loss/idx": 10.0, "loss/logits": 0.23912712931632996, "step": 5315 }, { "epoch": 0.07937941898924138, "grad_norm": 0.455078125, "grad_norm_var": 0.0026224772135416668, "learning_rate": 0.0001, "loss": 1.6305, "loss/crossentropy": 2.7469338178634644, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.2163892239332199, "step": 5316 }, { "epoch": 0.07939435115985635, "grad_norm": 0.4140625, "grad_norm_var": 0.002513996760050456, "learning_rate": 0.0001, "loss": 1.5877, "loss/crossentropy": 2.4822410345077515, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.19709594547748566, "step": 5317 }, { "epoch": 0.07940928333047133, "grad_norm": 0.38671875, "grad_norm_var": 0.002471160888671875, "learning_rate": 0.0001, "loss": 1.4086, "loss/crossentropy": 2.5513906478881836, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.16644436120986938, "step": 5318 }, { "epoch": 0.07942421550108632, "grad_norm": 0.470703125, "grad_norm_var": 0.002380482355753581, "learning_rate": 0.0001, "loss": 1.6758, "loss/crossentropy": 2.7287542819976807, "loss/fcd": 1.4453125, "loss/idx": 10.0, "loss/logits": 0.23046663403511047, "step": 5319 }, { "epoch": 0.0794391476717013, "grad_norm": 0.376953125, "grad_norm_var": 0.001111602783203125, "learning_rate": 0.0001, "loss": 1.5122, "loss/crossentropy": 2.5577555894851685, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.1958172842860222, "step": 5320 }, { "epoch": 0.07945407984231628, "grad_norm": 0.396484375, "grad_norm_var": 0.0011082808176676431, "learning_rate": 0.0001, "loss": 1.4673, "loss/crossentropy": 2.897834062576294, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.1821165755391121, "step": 5321 }, { "epoch": 0.07946901201293126, "grad_norm": 0.431640625, "grad_norm_var": 0.001129595438639323, "learning_rate": 0.0001, "loss": 1.6647, "loss/crossentropy": 2.7590456008911133, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.2232556864619255, "step": 5322 }, { "epoch": 0.07948394418354623, "grad_norm": 0.380859375, "grad_norm_var": 0.001160113016764323, "learning_rate": 0.0001, "loss": 1.5336, "loss/crossentropy": 2.616085171699524, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.1976960003376007, "step": 5323 }, { "epoch": 0.07949887635416122, "grad_norm": 0.37109375, "grad_norm_var": 0.0012247562408447266, "learning_rate": 0.0001, "loss": 1.5284, "loss/crossentropy": 2.4443020820617676, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.18460803478956223, "step": 5324 }, { "epoch": 0.0795138085247762, "grad_norm": 0.36328125, "grad_norm_var": 0.0009674072265625, "learning_rate": 0.0001, "loss": 1.4774, "loss/crossentropy": 2.669944167137146, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.18831368535757065, "step": 5325 }, { "epoch": 0.07952874069539119, "grad_norm": 0.35546875, "grad_norm_var": 0.0011227766672770182, "learning_rate": 0.0001, "loss": 1.5143, "loss/crossentropy": 2.5904624462127686, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.21737664937973022, "step": 5326 }, { "epoch": 0.07954367286600617, "grad_norm": 0.38671875, "grad_norm_var": 0.0011330763498942056, "learning_rate": 0.0001, "loss": 1.4957, "loss/crossentropy": 2.6161423921585083, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.1871470957994461, "step": 5327 }, { "epoch": 0.07955860503662116, "grad_norm": 0.3671875, "grad_norm_var": 0.001186227798461914, "learning_rate": 0.0001, "loss": 1.5545, "loss/crossentropy": 2.5778427124023438, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.20298391580581665, "step": 5328 }, { "epoch": 0.07957353720723613, "grad_norm": 0.56640625, "grad_norm_var": 0.0029184818267822266, "learning_rate": 0.0001, "loss": 1.5299, "loss/crossentropy": 2.699868321418762, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.2134474664926529, "step": 5329 }, { "epoch": 0.0795884693778511, "grad_norm": 0.412109375, "grad_norm_var": 0.0028823216756184897, "learning_rate": 0.0001, "loss": 1.6402, "loss/crossentropy": 2.4048354625701904, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.21444624662399292, "step": 5330 }, { "epoch": 0.0796034015484661, "grad_norm": 0.369140625, "grad_norm_var": 0.0029235204060872396, "learning_rate": 0.0001, "loss": 1.4997, "loss/crossentropy": 2.4861119985580444, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.18722590059041977, "step": 5331 }, { "epoch": 0.07961833371908107, "grad_norm": 0.390625, "grad_norm_var": 0.0027656396230061847, "learning_rate": 0.0001, "loss": 1.6224, "loss/crossentropy": 2.323858380317688, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.22001232206821442, "step": 5332 }, { "epoch": 0.07963326588969606, "grad_norm": 0.353515625, "grad_norm_var": 0.002901140848795573, "learning_rate": 0.0001, "loss": 1.5427, "loss/crossentropy": 2.5837390422821045, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.21459054201841354, "step": 5333 }, { "epoch": 0.07964819806031104, "grad_norm": 0.388671875, "grad_norm_var": 0.0028982639312744142, "learning_rate": 0.0001, "loss": 1.623, "loss/crossentropy": 2.4789175987243652, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.21287378668785095, "step": 5334 }, { "epoch": 0.07966313023092601, "grad_norm": 0.423828125, "grad_norm_var": 0.002586221694946289, "learning_rate": 0.0001, "loss": 1.7112, "loss/crossentropy": 2.512712240219116, "loss/fcd": 1.47265625, "loss/idx": 10.0, "loss/logits": 0.23852790147066116, "step": 5335 }, { "epoch": 0.079678062401541, "grad_norm": 0.58984375, "grad_norm_var": 0.004881795247395833, "learning_rate": 0.0001, "loss": 1.7207, "loss/crossentropy": 2.4936182498931885, "loss/fcd": 1.49609375, "loss/idx": 10.0, "loss/logits": 0.22459716349840164, "step": 5336 }, { "epoch": 0.07969299457215598, "grad_norm": 0.40234375, "grad_norm_var": 0.004874022801717123, "learning_rate": 0.0001, "loss": 1.4187, "loss/crossentropy": 2.8007060289382935, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.16869431734085083, "step": 5337 }, { "epoch": 0.07970792674277097, "grad_norm": 0.390625, "grad_norm_var": 0.004858334859212239, "learning_rate": 0.0001, "loss": 1.484, "loss/crossentropy": 2.517879843711853, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.18323994427919388, "step": 5338 }, { "epoch": 0.07972285891338594, "grad_norm": 0.3828125, "grad_norm_var": 0.004851770401000976, "learning_rate": 0.0001, "loss": 1.5768, "loss/crossentropy": 2.5954086780548096, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.20958247780799866, "step": 5339 }, { "epoch": 0.07973779108400092, "grad_norm": 0.390625, "grad_norm_var": 0.004781834284464518, "learning_rate": 0.0001, "loss": 1.4921, "loss/crossentropy": 2.781269073486328, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.19132938235998154, "step": 5340 }, { "epoch": 0.07975272325461591, "grad_norm": 0.400390625, "grad_norm_var": 0.004645029703776042, "learning_rate": 0.0001, "loss": 1.6009, "loss/crossentropy": 2.485008955001831, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.19853586703538895, "step": 5341 }, { "epoch": 0.07976765542523089, "grad_norm": 0.37109375, "grad_norm_var": 0.004545338948567708, "learning_rate": 0.0001, "loss": 1.4314, "loss/crossentropy": 2.7511422634124756, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18136631697416306, "step": 5342 }, { "epoch": 0.07978258759584587, "grad_norm": 0.361328125, "grad_norm_var": 0.004669936498006185, "learning_rate": 0.0001, "loss": 1.56, "loss/crossentropy": 2.5869064331054688, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.2162734419107437, "step": 5343 }, { "epoch": 0.07979751976646085, "grad_norm": 0.369140625, "grad_norm_var": 0.004659016927083333, "learning_rate": 0.0001, "loss": 1.5042, "loss/crossentropy": 2.814134955406189, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19954922795295715, "step": 5344 }, { "epoch": 0.07981245193707583, "grad_norm": 0.37890625, "grad_norm_var": 0.0029500325520833335, "learning_rate": 0.0001, "loss": 1.6067, "loss/crossentropy": 2.73071026802063, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.2161025106906891, "step": 5345 }, { "epoch": 0.07982738410769082, "grad_norm": 0.431640625, "grad_norm_var": 0.0030094782511393228, "learning_rate": 0.0001, "loss": 1.5806, "loss/crossentropy": 2.6170159578323364, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.2055811583995819, "step": 5346 }, { "epoch": 0.07984231627830579, "grad_norm": 0.380859375, "grad_norm_var": 0.0029703776041666665, "learning_rate": 0.0001, "loss": 1.5337, "loss/crossentropy": 2.481483221054077, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.19383075833320618, "step": 5347 }, { "epoch": 0.07985724844892078, "grad_norm": 0.3671875, "grad_norm_var": 0.0030352274576822915, "learning_rate": 0.0001, "loss": 1.3995, "loss/crossentropy": 2.539896607398987, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.16907178610563278, "step": 5348 }, { "epoch": 0.07987218061953576, "grad_norm": 0.341796875, "grad_norm_var": 0.003114763895670573, "learning_rate": 0.0001, "loss": 1.5026, "loss/crossentropy": 2.5432785749435425, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.2018386572599411, "step": 5349 }, { "epoch": 0.07988711279015075, "grad_norm": 0.333984375, "grad_norm_var": 0.003371111551920573, "learning_rate": 0.0001, "loss": 1.4335, "loss/crossentropy": 2.5869914293289185, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.17961485683918, "step": 5350 }, { "epoch": 0.07990204496076572, "grad_norm": 0.392578125, "grad_norm_var": 0.0033110936482747396, "learning_rate": 0.0001, "loss": 1.5728, "loss/crossentropy": 2.7634013891220093, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.21344546973705292, "step": 5351 }, { "epoch": 0.0799169771313807, "grad_norm": 0.38671875, "grad_norm_var": 0.0005538304646809896, "learning_rate": 0.0001, "loss": 1.7688, "loss/crossentropy": 2.672701835632324, "loss/fcd": 1.50390625, "loss/idx": 10.0, "loss/logits": 0.26493407785892487, "step": 5352 }, { "epoch": 0.07993190930199569, "grad_norm": 0.54296875, "grad_norm_var": 0.0022063573201497396, "learning_rate": 0.0001, "loss": 1.6335, "loss/crossentropy": 2.317577838897705, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.19598495215177536, "step": 5353 }, { "epoch": 0.07994684147261066, "grad_norm": 0.333984375, "grad_norm_var": 0.002393960952758789, "learning_rate": 0.0001, "loss": 1.4593, "loss/crossentropy": 2.5829999446868896, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.18584561347961426, "step": 5354 }, { "epoch": 0.07996177364322565, "grad_norm": 0.349609375, "grad_norm_var": 0.002474212646484375, "learning_rate": 0.0001, "loss": 1.6371, "loss/crossentropy": 2.475335478782654, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.23089351505041122, "step": 5355 }, { "epoch": 0.07997670581384063, "grad_norm": 0.5390625, "grad_norm_var": 0.00399627685546875, "learning_rate": 0.0001, "loss": 1.692, "loss/crossentropy": 2.25055992603302, "loss/fcd": 1.5, "loss/idx": 10.0, "loss/logits": 0.19201570749282837, "step": 5356 }, { "epoch": 0.0799916379844556, "grad_norm": 0.380859375, "grad_norm_var": 0.003999773661295573, "learning_rate": 0.0001, "loss": 1.653, "loss/crossentropy": 2.652812123298645, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.2310793101787567, "step": 5357 }, { "epoch": 0.0800065701550706, "grad_norm": 0.359375, "grad_norm_var": 0.0040400187174479164, "learning_rate": 0.0001, "loss": 1.502, "loss/crossentropy": 2.7510465383529663, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.19337748736143112, "step": 5358 }, { "epoch": 0.08002150232568557, "grad_norm": 0.357421875, "grad_norm_var": 0.004056231180826823, "learning_rate": 0.0001, "loss": 1.5238, "loss/crossentropy": 2.598837733268738, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.21525149047374725, "step": 5359 }, { "epoch": 0.08003643449630056, "grad_norm": 0.3828125, "grad_norm_var": 0.004029194513956706, "learning_rate": 0.0001, "loss": 1.5092, "loss/crossentropy": 2.585978627204895, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.2005893737077713, "step": 5360 }, { "epoch": 0.08005136666691554, "grad_norm": 0.376953125, "grad_norm_var": 0.004032643636067709, "learning_rate": 0.0001, "loss": 1.5504, "loss/crossentropy": 2.756770133972168, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.22224575281143188, "step": 5361 }, { "epoch": 0.08006629883753051, "grad_norm": 0.34765625, "grad_norm_var": 0.0040196577707926435, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.768447518348694, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.1765330284833908, "step": 5362 }, { "epoch": 0.0800812310081455, "grad_norm": 0.41015625, "grad_norm_var": 0.004053751627604167, "learning_rate": 0.0001, "loss": 1.5454, "loss/crossentropy": 2.7004882097244263, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.19773508608341217, "step": 5363 }, { "epoch": 0.08009616317876048, "grad_norm": 0.369140625, "grad_norm_var": 0.004048649470011393, "learning_rate": 0.0001, "loss": 1.4228, "loss/crossentropy": 2.4343305826187134, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18056660890579224, "step": 5364 }, { "epoch": 0.08011109534937547, "grad_norm": 0.365234375, "grad_norm_var": 0.0039391676584879555, "learning_rate": 0.0001, "loss": 1.4747, "loss/crossentropy": 2.8048843145370483, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.19340451061725616, "step": 5365 }, { "epoch": 0.08012602751999044, "grad_norm": 0.35546875, "grad_norm_var": 0.0038096110026041665, "learning_rate": 0.0001, "loss": 1.5863, "loss/crossentropy": 2.4736013412475586, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.20349542051553726, "step": 5366 }, { "epoch": 0.08014095969060543, "grad_norm": 0.5078125, "grad_norm_var": 0.004669555028279622, "learning_rate": 0.0001, "loss": 1.7708, "loss/crossentropy": 2.514909029006958, "loss/fcd": 1.54296875, "loss/idx": 10.0, "loss/logits": 0.22784429043531418, "step": 5367 }, { "epoch": 0.08015589186122041, "grad_norm": 0.416015625, "grad_norm_var": 0.0046798070271809895, "learning_rate": 0.0001, "loss": 1.6265, "loss/crossentropy": 2.367187023162842, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.224180705845356, "step": 5368 }, { "epoch": 0.08017082403183538, "grad_norm": 0.416015625, "grad_norm_var": 0.0032612959543863933, "learning_rate": 0.0001, "loss": 1.8407, "loss/crossentropy": 2.647940516471863, "loss/fcd": 1.56640625, "loss/idx": 10.0, "loss/logits": 0.27432645857334137, "step": 5369 }, { "epoch": 0.08018575620245037, "grad_norm": 0.400390625, "grad_norm_var": 0.003025674819946289, "learning_rate": 0.0001, "loss": 1.6981, "loss/crossentropy": 2.64590847492218, "loss/fcd": 1.453125, "loss/idx": 10.0, "loss/logits": 0.24502480030059814, "step": 5370 }, { "epoch": 0.08020068837306535, "grad_norm": 0.43359375, "grad_norm_var": 0.0029484430948893228, "learning_rate": 0.0001, "loss": 1.7073, "loss/crossentropy": 2.4826329946517944, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.25029950588941574, "step": 5371 }, { "epoch": 0.08021562054368034, "grad_norm": 0.380859375, "grad_norm_var": 0.0016030470530192058, "learning_rate": 0.0001, "loss": 1.5205, "loss/crossentropy": 2.4130120277404785, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.19631223380565643, "step": 5372 }, { "epoch": 0.08023055271429531, "grad_norm": 0.388671875, "grad_norm_var": 0.0015960534413655598, "learning_rate": 0.0001, "loss": 1.7036, "loss/crossentropy": 2.643268585205078, "loss/fcd": 1.44921875, "loss/idx": 10.0, "loss/logits": 0.25433240830898285, "step": 5373 }, { "epoch": 0.08024548488491029, "grad_norm": 0.4296875, "grad_norm_var": 0.0016017754872639973, "learning_rate": 0.0001, "loss": 1.6133, "loss/crossentropy": 2.5815831422805786, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.2226807102560997, "step": 5374 }, { "epoch": 0.08026041705552528, "grad_norm": 0.3984375, "grad_norm_var": 0.0014952977498372396, "learning_rate": 0.0001, "loss": 1.5228, "loss/crossentropy": 2.661044120788574, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.1947060227394104, "step": 5375 }, { "epoch": 0.08027534922614026, "grad_norm": 0.373046875, "grad_norm_var": 0.001521921157836914, "learning_rate": 0.0001, "loss": 1.6244, "loss/crossentropy": 2.6279947757720947, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.21817081421613693, "step": 5376 }, { "epoch": 0.08029028139675524, "grad_norm": 0.35546875, "grad_norm_var": 0.0016112645467122396, "learning_rate": 0.0001, "loss": 1.4985, "loss/crossentropy": 2.486128568649292, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19379210472106934, "step": 5377 }, { "epoch": 0.08030521356737022, "grad_norm": 0.34375, "grad_norm_var": 0.0016377766927083333, "learning_rate": 0.0001, "loss": 1.5741, "loss/crossentropy": 2.574691414833069, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.22258589416742325, "step": 5378 }, { "epoch": 0.0803201457379852, "grad_norm": 0.423828125, "grad_norm_var": 0.0016743818918863933, "learning_rate": 0.0001, "loss": 1.6617, "loss/crossentropy": 2.491631269454956, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.22028498351573944, "step": 5379 }, { "epoch": 0.08033507790860019, "grad_norm": 0.427734375, "grad_norm_var": 0.0016686598459879558, "learning_rate": 0.0001, "loss": 1.5402, "loss/crossentropy": 2.629339098930359, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.20812370628118515, "step": 5380 }, { "epoch": 0.08035001007921516, "grad_norm": 0.3671875, "grad_norm_var": 0.0016595840454101563, "learning_rate": 0.0001, "loss": 1.5639, "loss/crossentropy": 2.6245683431625366, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.21232934296131134, "step": 5381 }, { "epoch": 0.08036494224983015, "grad_norm": 0.357421875, "grad_norm_var": 0.00164793332417806, "learning_rate": 0.0001, "loss": 1.5464, "loss/crossentropy": 2.5468536615371704, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.19478848576545715, "step": 5382 }, { "epoch": 0.08037987442044513, "grad_norm": 0.34375, "grad_norm_var": 0.0009990533192952473, "learning_rate": 0.0001, "loss": 1.5201, "loss/crossentropy": 2.67034375667572, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20761334151029587, "step": 5383 }, { "epoch": 0.0803948065910601, "grad_norm": 0.33203125, "grad_norm_var": 0.00115966796875, "learning_rate": 0.0001, "loss": 1.4014, "loss/crossentropy": 2.7001959085464478, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17483387142419815, "step": 5384 }, { "epoch": 0.08040973876167509, "grad_norm": 0.412109375, "grad_norm_var": 0.001144854227701823, "learning_rate": 0.0001, "loss": 1.617, "loss/crossentropy": 2.6639535427093506, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.23027561604976654, "step": 5385 }, { "epoch": 0.08042467093229007, "grad_norm": 0.421875, "grad_norm_var": 0.0012163639068603516, "learning_rate": 0.0001, "loss": 1.454, "loss/crossentropy": 2.491113305091858, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.16885056346654892, "step": 5386 }, { "epoch": 0.08043960310290506, "grad_norm": 0.37109375, "grad_norm_var": 0.0010708967844645183, "learning_rate": 0.0001, "loss": 1.5987, "loss/crossentropy": 2.527896285057068, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.22372809797525406, "step": 5387 }, { "epoch": 0.08045453527352003, "grad_norm": 0.341796875, "grad_norm_var": 0.001177072525024414, "learning_rate": 0.0001, "loss": 1.4835, "loss/crossentropy": 2.3481361865997314, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.19830501824617386, "step": 5388 }, { "epoch": 0.08046946744413502, "grad_norm": 0.43359375, "grad_norm_var": 0.0013521830240885416, "learning_rate": 0.0001, "loss": 1.7165, "loss/crossentropy": 2.3144047260284424, "loss/fcd": 1.4921875, "loss/idx": 10.0, "loss/logits": 0.22426556050777435, "step": 5389 }, { "epoch": 0.08048439961475, "grad_norm": 0.375, "grad_norm_var": 0.00120086669921875, "learning_rate": 0.0001, "loss": 1.5114, "loss/crossentropy": 2.5703043937683105, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.20279935747385025, "step": 5390 }, { "epoch": 0.08049933178536497, "grad_norm": 0.376953125, "grad_norm_var": 0.0011765638987223308, "learning_rate": 0.0001, "loss": 1.4759, "loss/crossentropy": 2.829824209213257, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.20250166207551956, "step": 5391 }, { "epoch": 0.08051426395597996, "grad_norm": 0.328125, "grad_norm_var": 0.001335589090983073, "learning_rate": 0.0001, "loss": 1.6251, "loss/crossentropy": 2.64094614982605, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.23842430114746094, "step": 5392 }, { "epoch": 0.08052919612659494, "grad_norm": 0.322265625, "grad_norm_var": 0.0014942010243733724, "learning_rate": 0.0001, "loss": 1.5596, "loss/crossentropy": 2.5731699466705322, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.21583804488182068, "step": 5393 }, { "epoch": 0.08054412829720993, "grad_norm": 0.318359375, "grad_norm_var": 0.0016357421875, "learning_rate": 0.0001, "loss": 1.3994, "loss/crossentropy": 2.4378480911254883, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.16890642791986465, "step": 5394 }, { "epoch": 0.0805590604678249, "grad_norm": 0.380859375, "grad_norm_var": 0.001454607645670573, "learning_rate": 0.0001, "loss": 1.589, "loss/crossentropy": 2.8014683723449707, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.22966697067022324, "step": 5395 }, { "epoch": 0.08057399263843988, "grad_norm": 0.3671875, "grad_norm_var": 0.0012126763661702473, "learning_rate": 0.0001, "loss": 1.5417, "loss/crossentropy": 2.6082457304000854, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20572249591350555, "step": 5396 }, { "epoch": 0.08058892480905487, "grad_norm": 0.453125, "grad_norm_var": 0.0016924381256103516, "learning_rate": 0.0001, "loss": 1.538, "loss/crossentropy": 2.6109979152679443, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20209342241287231, "step": 5397 }, { "epoch": 0.08060385697966985, "grad_norm": 0.376953125, "grad_norm_var": 0.0016809940338134766, "learning_rate": 0.0001, "loss": 1.6358, "loss/crossentropy": 2.4097461700439453, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.2178029641509056, "step": 5398 }, { "epoch": 0.08061878915028484, "grad_norm": 0.41015625, "grad_norm_var": 0.0017047723134358723, "learning_rate": 0.0001, "loss": 1.5075, "loss/crossentropy": 2.7110533714294434, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.20672734826803207, "step": 5399 }, { "epoch": 0.08063372132089981, "grad_norm": 0.59765625, "grad_norm_var": 0.004545195897420248, "learning_rate": 0.0001, "loss": 1.7872, "loss/crossentropy": 2.659254550933838, "loss/fcd": 1.52734375, "loss/idx": 10.0, "loss/logits": 0.25982221961021423, "step": 5400 }, { "epoch": 0.08064865349151479, "grad_norm": 0.349609375, "grad_norm_var": 0.0046296278635660805, "learning_rate": 0.0001, "loss": 1.5397, "loss/crossentropy": 2.6570777893066406, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.21152621507644653, "step": 5401 }, { "epoch": 0.08066358566212978, "grad_norm": 0.38671875, "grad_norm_var": 0.004552952448527018, "learning_rate": 0.0001, "loss": 1.4881, "loss/crossentropy": 2.6076509952545166, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.1990063190460205, "step": 5402 }, { "epoch": 0.08067851783274475, "grad_norm": 0.33984375, "grad_norm_var": 0.0046796003977457685, "learning_rate": 0.0001, "loss": 1.4372, "loss/crossentropy": 2.557447075843811, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.191103957593441, "step": 5403 }, { "epoch": 0.08069345000335974, "grad_norm": 0.37109375, "grad_norm_var": 0.004564921061197917, "learning_rate": 0.0001, "loss": 1.4557, "loss/crossentropy": 2.681131958961487, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.18222137540578842, "step": 5404 }, { "epoch": 0.08070838217397472, "grad_norm": 0.3515625, "grad_norm_var": 0.004472796122233073, "learning_rate": 0.0001, "loss": 1.4068, "loss/crossentropy": 2.646591067314148, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.18808694928884506, "step": 5405 }, { "epoch": 0.0807233143445897, "grad_norm": 0.322265625, "grad_norm_var": 0.004692951838175456, "learning_rate": 0.0001, "loss": 1.3998, "loss/crossentropy": 2.6756720542907715, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.1771066039800644, "step": 5406 }, { "epoch": 0.08073824651520468, "grad_norm": 0.3515625, "grad_norm_var": 0.00473779042561849, "learning_rate": 0.0001, "loss": 1.4806, "loss/crossentropy": 2.7970573902130127, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19158188998699188, "step": 5407 }, { "epoch": 0.08075317868581966, "grad_norm": 0.357421875, "grad_norm_var": 0.0046016534169514975, "learning_rate": 0.0001, "loss": 1.4049, "loss/crossentropy": 2.4185277223587036, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.16659975051879883, "step": 5408 }, { "epoch": 0.08076811085643465, "grad_norm": 0.3203125, "grad_norm_var": 0.004616546630859375, "learning_rate": 0.0001, "loss": 1.4425, "loss/crossentropy": 2.4619003534317017, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.17691771686077118, "step": 5409 }, { "epoch": 0.08078304302704963, "grad_norm": 0.376953125, "grad_norm_var": 0.004361915588378906, "learning_rate": 0.0001, "loss": 1.557, "loss/crossentropy": 2.6186453104019165, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.1975952759385109, "step": 5410 }, { "epoch": 0.08079797519766461, "grad_norm": 0.34375, "grad_norm_var": 0.00445402463277181, "learning_rate": 0.0001, "loss": 1.4838, "loss/crossentropy": 2.6742258071899414, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.20257382094860077, "step": 5411 }, { "epoch": 0.08081290736827959, "grad_norm": 0.35546875, "grad_norm_var": 0.004482253392537435, "learning_rate": 0.0001, "loss": 1.5265, "loss/crossentropy": 2.632409453392029, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.21397248655557632, "step": 5412 }, { "epoch": 0.08082783953889457, "grad_norm": 0.34765625, "grad_norm_var": 0.004135497411092122, "learning_rate": 0.0001, "loss": 1.4466, "loss/crossentropy": 2.5869721174240112, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18099573254585266, "step": 5413 }, { "epoch": 0.08084277170950956, "grad_norm": 0.375, "grad_norm_var": 0.004134559631347656, "learning_rate": 0.0001, "loss": 1.5445, "loss/crossentropy": 2.7531182765960693, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.22028660029172897, "step": 5414 }, { "epoch": 0.08085770388012453, "grad_norm": 0.408203125, "grad_norm_var": 0.004124943415323893, "learning_rate": 0.0001, "loss": 1.5389, "loss/crossentropy": 2.6735183000564575, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.2186356633901596, "step": 5415 }, { "epoch": 0.08087263605073952, "grad_norm": 0.458984375, "grad_norm_var": 0.0011580785115559895, "learning_rate": 0.0001, "loss": 1.5223, "loss/crossentropy": 2.6192487478256226, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20194579660892487, "step": 5416 }, { "epoch": 0.0808875682213545, "grad_norm": 0.421875, "grad_norm_var": 0.00135038693745931, "learning_rate": 0.0001, "loss": 1.605, "loss/crossentropy": 2.5121047496795654, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.21045339852571487, "step": 5417 }, { "epoch": 0.08090250039196947, "grad_norm": 0.35546875, "grad_norm_var": 0.00133360226949056, "learning_rate": 0.0001, "loss": 1.5096, "loss/crossentropy": 2.5735288858413696, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.200991652905941, "step": 5418 }, { "epoch": 0.08091743256258446, "grad_norm": 0.423828125, "grad_norm_var": 0.001480547587076823, "learning_rate": 0.0001, "loss": 1.7011, "loss/crossentropy": 2.475821375846863, "loss/fcd": 1.4453125, "loss/idx": 10.0, "loss/logits": 0.255778931081295, "step": 5419 }, { "epoch": 0.08093236473319944, "grad_norm": 0.4375, "grad_norm_var": 0.001753997802734375, "learning_rate": 0.0001, "loss": 1.6095, "loss/crossentropy": 2.1791625022888184, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.207158625125885, "step": 5420 }, { "epoch": 0.08094729690381443, "grad_norm": 0.419921875, "grad_norm_var": 0.0018279870351155599, "learning_rate": 0.0001, "loss": 1.6291, "loss/crossentropy": 2.440064311027527, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.21506782621145248, "step": 5421 }, { "epoch": 0.0809622290744294, "grad_norm": 0.4140625, "grad_norm_var": 0.0016509373982747397, "learning_rate": 0.0001, "loss": 1.7082, "loss/crossentropy": 2.725707530975342, "loss/fcd": 1.44921875, "loss/idx": 10.0, "loss/logits": 0.25893354415893555, "step": 5422 }, { "epoch": 0.08097716124504438, "grad_norm": 0.345703125, "grad_norm_var": 0.0016795953114827475, "learning_rate": 0.0001, "loss": 1.5111, "loss/crossentropy": 2.4502882957458496, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.19464798271656036, "step": 5423 }, { "epoch": 0.08099209341565937, "grad_norm": 0.330078125, "grad_norm_var": 0.0018273512522379556, "learning_rate": 0.0001, "loss": 1.4878, "loss/crossentropy": 2.690967321395874, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.2065763920545578, "step": 5424 }, { "epoch": 0.08100702558627434, "grad_norm": 0.455078125, "grad_norm_var": 0.001828447977701823, "learning_rate": 0.0001, "loss": 1.789, "loss/crossentropy": 2.5387582778930664, "loss/fcd": 1.48828125, "loss/idx": 10.0, "loss/logits": 0.30070438981056213, "step": 5425 }, { "epoch": 0.08102195775688933, "grad_norm": 0.369140625, "grad_norm_var": 0.0018477757771809896, "learning_rate": 0.0001, "loss": 1.5833, "loss/crossentropy": 2.8431812524795532, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.22396066784858704, "step": 5426 }, { "epoch": 0.08103688992750431, "grad_norm": 0.376953125, "grad_norm_var": 0.0017059167226155599, "learning_rate": 0.0001, "loss": 1.4566, "loss/crossentropy": 2.4329633712768555, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.17926125228405, "step": 5427 }, { "epoch": 0.0810518220981193, "grad_norm": 0.373046875, "grad_norm_var": 0.0016362508138020833, "learning_rate": 0.0001, "loss": 1.5738, "loss/crossentropy": 2.768356680870056, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.23000672459602356, "step": 5428 }, { "epoch": 0.08106675426873428, "grad_norm": 0.421875, "grad_norm_var": 0.0015166600545247395, "learning_rate": 0.0001, "loss": 1.5702, "loss/crossentropy": 2.7063465118408203, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.2186141237616539, "step": 5429 }, { "epoch": 0.08108168643934925, "grad_norm": 0.392578125, "grad_norm_var": 0.001479323705037435, "learning_rate": 0.0001, "loss": 1.5843, "loss/crossentropy": 2.611118793487549, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.21707789599895477, "step": 5430 }, { "epoch": 0.08109661860996424, "grad_norm": 0.5078125, "grad_norm_var": 0.0022048314412434896, "learning_rate": 0.0001, "loss": 1.5958, "loss/crossentropy": 2.478966236114502, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.21688276529312134, "step": 5431 }, { "epoch": 0.08111155078057922, "grad_norm": 0.421875, "grad_norm_var": 0.0020311832427978515, "learning_rate": 0.0001, "loss": 1.5917, "loss/crossentropy": 2.5277191400527954, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.20495034754276276, "step": 5432 }, { "epoch": 0.0811264829511942, "grad_norm": 0.423828125, "grad_norm_var": 0.0020360310872395834, "learning_rate": 0.0001, "loss": 1.5233, "loss/crossentropy": 2.7304840087890625, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.17958004772663116, "step": 5433 }, { "epoch": 0.08114141512180918, "grad_norm": 0.3828125, "grad_norm_var": 0.001904741923014323, "learning_rate": 0.0001, "loss": 1.5555, "loss/crossentropy": 2.5808498859405518, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.22345828264951706, "step": 5434 }, { "epoch": 0.08115634729242416, "grad_norm": 0.408203125, "grad_norm_var": 0.0018828709920247395, "learning_rate": 0.0001, "loss": 1.5987, "loss/crossentropy": 2.5756278038024902, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.21590203046798706, "step": 5435 }, { "epoch": 0.08117127946303915, "grad_norm": 0.373046875, "grad_norm_var": 0.0018634637196858724, "learning_rate": 0.0001, "loss": 1.5008, "loss/crossentropy": 2.593587636947632, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.1882796734571457, "step": 5436 }, { "epoch": 0.08118621163365412, "grad_norm": 0.3359375, "grad_norm_var": 0.002092425028483073, "learning_rate": 0.0001, "loss": 1.534, "loss/crossentropy": 2.684492349624634, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.19801992177963257, "step": 5437 }, { "epoch": 0.08120114380426911, "grad_norm": 0.453125, "grad_norm_var": 0.002283159891764323, "learning_rate": 0.0001, "loss": 1.637, "loss/crossentropy": 2.7786948680877686, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.21509917080402374, "step": 5438 }, { "epoch": 0.08121607597488409, "grad_norm": 0.3515625, "grad_norm_var": 0.002244297663370768, "learning_rate": 0.0001, "loss": 1.4948, "loss/crossentropy": 2.405872106552124, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.2018725872039795, "step": 5439 }, { "epoch": 0.08123100814549906, "grad_norm": 0.3828125, "grad_norm_var": 0.0019365946451822917, "learning_rate": 0.0001, "loss": 1.4172, "loss/crossentropy": 2.5734708309173584, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.1828325167298317, "step": 5440 }, { "epoch": 0.08124594031611405, "grad_norm": 0.33984375, "grad_norm_var": 0.0019487857818603516, "learning_rate": 0.0001, "loss": 1.417, "loss/crossentropy": 2.6180174350738525, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.17478401213884354, "step": 5441 }, { "epoch": 0.08126087248672903, "grad_norm": 0.47265625, "grad_norm_var": 0.002266375223795573, "learning_rate": 0.0001, "loss": 1.6488, "loss/crossentropy": 2.8826061487197876, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.2425864264369011, "step": 5442 }, { "epoch": 0.08127580465734402, "grad_norm": 0.373046875, "grad_norm_var": 0.0022799173990885415, "learning_rate": 0.0001, "loss": 1.5564, "loss/crossentropy": 2.6700328588485718, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.2126794382929802, "step": 5443 }, { "epoch": 0.081290736827959, "grad_norm": 0.328125, "grad_norm_var": 0.0025727430979410808, "learning_rate": 0.0001, "loss": 1.4742, "loss/crossentropy": 2.711057186126709, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.20073194801807404, "step": 5444 }, { "epoch": 0.08130566899857397, "grad_norm": 0.37109375, "grad_norm_var": 0.0025727430979410808, "learning_rate": 0.0001, "loss": 1.4673, "loss/crossentropy": 2.4069786071777344, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.17821653932332993, "step": 5445 }, { "epoch": 0.08132060116918896, "grad_norm": 0.35546875, "grad_norm_var": 0.0026702880859375, "learning_rate": 0.0001, "loss": 1.726, "loss/crossentropy": 2.5286206007003784, "loss/fcd": 1.47265625, "loss/idx": 10.0, "loss/logits": 0.253304585814476, "step": 5446 }, { "epoch": 0.08133553333980394, "grad_norm": 0.33984375, "grad_norm_var": 0.0018528620402018228, "learning_rate": 0.0001, "loss": 1.4844, "loss/crossentropy": 2.648185968399048, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.19922512024641037, "step": 5447 }, { "epoch": 0.08135046551041893, "grad_norm": 0.365234375, "grad_norm_var": 0.0017528374989827474, "learning_rate": 0.0001, "loss": 1.5513, "loss/crossentropy": 2.6130778789520264, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.20750685781240463, "step": 5448 }, { "epoch": 0.0813653976810339, "grad_norm": 0.388671875, "grad_norm_var": 0.0016177972157796225, "learning_rate": 0.0001, "loss": 1.6013, "loss/crossentropy": 2.6507840156555176, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.22240936011075974, "step": 5449 }, { "epoch": 0.08138032985164889, "grad_norm": 0.337890625, "grad_norm_var": 0.001705169677734375, "learning_rate": 0.0001, "loss": 1.4555, "loss/crossentropy": 2.4775854349136353, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.18595945835113525, "step": 5450 }, { "epoch": 0.08139526202226387, "grad_norm": 0.353515625, "grad_norm_var": 0.0016393025716145834, "learning_rate": 0.0001, "loss": 1.5325, "loss/crossentropy": 2.7337504625320435, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.20831398665905, "step": 5451 }, { "epoch": 0.08141019419287884, "grad_norm": 0.361328125, "grad_norm_var": 0.0016433080037434897, "learning_rate": 0.0001, "loss": 1.5003, "loss/crossentropy": 2.637825131416321, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.20342648774385452, "step": 5452 }, { "epoch": 0.08142512636349383, "grad_norm": 0.353515625, "grad_norm_var": 0.0015842278798421225, "learning_rate": 0.0001, "loss": 1.5392, "loss/crossentropy": 2.6669163703918457, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.2189321592450142, "step": 5453 }, { "epoch": 0.08144005853410881, "grad_norm": 0.333984375, "grad_norm_var": 0.0011585871378580728, "learning_rate": 0.0001, "loss": 1.4218, "loss/crossentropy": 2.4245166778564453, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18739498406648636, "step": 5454 }, { "epoch": 0.0814549907047238, "grad_norm": 0.3515625, "grad_norm_var": 0.0011585871378580728, "learning_rate": 0.0001, "loss": 1.4627, "loss/crossentropy": 2.553678274154663, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.1971074491739273, "step": 5455 }, { "epoch": 0.08146992287533877, "grad_norm": 0.36328125, "grad_norm_var": 0.0011309305826822916, "learning_rate": 0.0001, "loss": 1.6291, "loss/crossentropy": 2.5830676555633545, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.2345622256398201, "step": 5456 }, { "epoch": 0.08148485504595375, "grad_norm": 0.36328125, "grad_norm_var": 0.0010965983072916666, "learning_rate": 0.0001, "loss": 1.5234, "loss/crossentropy": 2.5564099550247192, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20309948921203613, "step": 5457 }, { "epoch": 0.08149978721656874, "grad_norm": 0.3359375, "grad_norm_var": 0.00027103424072265624, "learning_rate": 0.0001, "loss": 1.515, "loss/crossentropy": 2.618611693382263, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.2064334601163864, "step": 5458 }, { "epoch": 0.08151471938718372, "grad_norm": 0.369140625, "grad_norm_var": 0.000262451171875, "learning_rate": 0.0001, "loss": 1.4931, "loss/crossentropy": 2.5447713136672974, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.18840189278125763, "step": 5459 }, { "epoch": 0.0815296515577987, "grad_norm": 0.3515625, "grad_norm_var": 0.000214385986328125, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.381119132041931, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.2161390781402588, "step": 5460 }, { "epoch": 0.08154458372841368, "grad_norm": 0.400390625, "grad_norm_var": 0.0003271579742431641, "learning_rate": 0.0001, "loss": 1.6672, "loss/crossentropy": 2.647617220878601, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.22970911860466003, "step": 5461 }, { "epoch": 0.08155951589902866, "grad_norm": 0.365234375, "grad_norm_var": 0.0003300984700520833, "learning_rate": 0.0001, "loss": 1.5883, "loss/crossentropy": 2.78229022026062, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.22505014389753342, "step": 5462 }, { "epoch": 0.08157444806964365, "grad_norm": 0.36328125, "grad_norm_var": 0.0003064473470052083, "learning_rate": 0.0001, "loss": 1.6016, "loss/crossentropy": 2.5234415531158447, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.21093739569187164, "step": 5463 }, { "epoch": 0.08158938024025862, "grad_norm": 0.33203125, "grad_norm_var": 0.00035157203674316404, "learning_rate": 0.0001, "loss": 1.4164, "loss/crossentropy": 2.7206767797470093, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18201202154159546, "step": 5464 }, { "epoch": 0.08160431241087361, "grad_norm": 0.341796875, "grad_norm_var": 0.00029587745666503906, "learning_rate": 0.0001, "loss": 1.4205, "loss/crossentropy": 2.6061517000198364, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.170524001121521, "step": 5465 }, { "epoch": 0.08161924458148859, "grad_norm": 0.43359375, "grad_norm_var": 0.0006518046061197917, "learning_rate": 0.0001, "loss": 1.7789, "loss/crossentropy": 2.4099819660186768, "loss/fcd": 1.54296875, "loss/idx": 10.0, "loss/logits": 0.23597851395606995, "step": 5466 }, { "epoch": 0.08163417675210356, "grad_norm": 0.4140625, "grad_norm_var": 0.0008217970530192058, "learning_rate": 0.0001, "loss": 1.4381, "loss/crossentropy": 2.455990195274353, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.19197824597358704, "step": 5467 }, { "epoch": 0.08164910892271855, "grad_norm": 0.37890625, "grad_norm_var": 0.0008333841959635417, "learning_rate": 0.0001, "loss": 1.7033, "loss/crossentropy": 2.5657607316970825, "loss/fcd": 1.44921875, "loss/idx": 10.0, "loss/logits": 0.25408486276865005, "step": 5468 }, { "epoch": 0.08166404109333353, "grad_norm": 0.388671875, "grad_norm_var": 0.0008534113566080729, "learning_rate": 0.0001, "loss": 1.5207, "loss/crossentropy": 2.7134238481521606, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20821237564086914, "step": 5469 }, { "epoch": 0.08167897326394852, "grad_norm": 0.388671875, "grad_norm_var": 0.0007928848266601562, "learning_rate": 0.0001, "loss": 1.5302, "loss/crossentropy": 2.5595563650131226, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20993580669164658, "step": 5470 }, { "epoch": 0.0816939054345635, "grad_norm": 0.345703125, "grad_norm_var": 0.0008104801177978516, "learning_rate": 0.0001, "loss": 1.5079, "loss/crossentropy": 2.574398636817932, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.1953570768237114, "step": 5471 }, { "epoch": 0.08170883760517848, "grad_norm": 0.44921875, "grad_norm_var": 0.001183938980102539, "learning_rate": 0.0001, "loss": 1.5065, "loss/crossentropy": 2.50656521320343, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.19404692202806473, "step": 5472 }, { "epoch": 0.08172376977579346, "grad_norm": 0.37109375, "grad_norm_var": 0.001174147923787435, "learning_rate": 0.0001, "loss": 1.5799, "loss/crossentropy": 2.4839212894439697, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.21660484373569489, "step": 5473 }, { "epoch": 0.08173870194640843, "grad_norm": 0.423828125, "grad_norm_var": 0.0011777242024739584, "learning_rate": 0.0001, "loss": 1.6476, "loss/crossentropy": 2.6408207416534424, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.21405573189258575, "step": 5474 }, { "epoch": 0.08175363411702342, "grad_norm": 0.349609375, "grad_norm_var": 0.0012358983357747396, "learning_rate": 0.0001, "loss": 1.4266, "loss/crossentropy": 2.5839065313339233, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18445315957069397, "step": 5475 }, { "epoch": 0.0817685662876384, "grad_norm": 0.380859375, "grad_norm_var": 0.001174147923787435, "learning_rate": 0.0001, "loss": 1.4772, "loss/crossentropy": 2.5873008966445923, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.19200889766216278, "step": 5476 }, { "epoch": 0.08178349845825339, "grad_norm": 0.353515625, "grad_norm_var": 0.0012023766835530599, "learning_rate": 0.0001, "loss": 1.4656, "loss/crossentropy": 2.444553017616272, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.17654526233673096, "step": 5477 }, { "epoch": 0.08179843062886837, "grad_norm": 0.37109375, "grad_norm_var": 0.0011929829915364584, "learning_rate": 0.0001, "loss": 1.4842, "loss/crossentropy": 2.6247568130493164, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.18729791790246964, "step": 5478 }, { "epoch": 0.08181336279948334, "grad_norm": 0.37890625, "grad_norm_var": 0.001172637939453125, "learning_rate": 0.0001, "loss": 1.5908, "loss/crossentropy": 2.626042604446411, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.231385700404644, "step": 5479 }, { "epoch": 0.08182829497009833, "grad_norm": 0.349609375, "grad_norm_var": 0.0010763645172119141, "learning_rate": 0.0001, "loss": 1.4378, "loss/crossentropy": 2.489779829978943, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.17994952946901321, "step": 5480 }, { "epoch": 0.0818432271407133, "grad_norm": 0.375, "grad_norm_var": 0.0009653091430664063, "learning_rate": 0.0001, "loss": 1.4567, "loss/crossentropy": 2.486638069152832, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.17153123021125793, "step": 5481 }, { "epoch": 0.0818581593113283, "grad_norm": 0.349609375, "grad_norm_var": 0.0008566379547119141, "learning_rate": 0.0001, "loss": 1.345, "loss/crossentropy": 2.6196545362472534, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.1575259268283844, "step": 5482 }, { "epoch": 0.08187309148194327, "grad_norm": 0.341796875, "grad_norm_var": 0.0008478164672851562, "learning_rate": 0.0001, "loss": 1.5057, "loss/crossentropy": 2.6533032655715942, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.20491177588701248, "step": 5483 }, { "epoch": 0.08188802365255825, "grad_norm": 0.361328125, "grad_norm_var": 0.000857400894165039, "learning_rate": 0.0001, "loss": 1.4095, "loss/crossentropy": 2.4587814807891846, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.17515377700328827, "step": 5484 }, { "epoch": 0.08190295582317324, "grad_norm": 0.423828125, "grad_norm_var": 0.0010050296783447265, "learning_rate": 0.0001, "loss": 1.5526, "loss/crossentropy": 2.423388361930847, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.1971541792154312, "step": 5485 }, { "epoch": 0.08191788799378821, "grad_norm": 0.306640625, "grad_norm_var": 0.001285409927368164, "learning_rate": 0.0001, "loss": 1.4665, "loss/crossentropy": 2.4757630825042725, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.19695696979761124, "step": 5486 }, { "epoch": 0.0819328201644032, "grad_norm": 0.419921875, "grad_norm_var": 0.0013820489247639975, "learning_rate": 0.0001, "loss": 1.5439, "loss/crossentropy": 2.2668449878692627, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.18454258143901825, "step": 5487 }, { "epoch": 0.08194775233501818, "grad_norm": 0.36328125, "grad_norm_var": 0.0009974002838134765, "learning_rate": 0.0001, "loss": 1.4921, "loss/crossentropy": 2.6063274145126343, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.2108549103140831, "step": 5488 }, { "epoch": 0.08196268450563317, "grad_norm": 0.388671875, "grad_norm_var": 0.001019287109375, "learning_rate": 0.0001, "loss": 1.4353, "loss/crossentropy": 2.574995994567871, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18140264600515366, "step": 5489 }, { "epoch": 0.08197761667624814, "grad_norm": 0.376953125, "grad_norm_var": 0.0008270263671875, "learning_rate": 0.0001, "loss": 1.5293, "loss/crossentropy": 2.86787486076355, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.2128635123372078, "step": 5490 }, { "epoch": 0.08199254884686312, "grad_norm": 0.3359375, "grad_norm_var": 0.0008725325266520182, "learning_rate": 0.0001, "loss": 1.4058, "loss/crossentropy": 2.5014158487319946, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.18704965710639954, "step": 5491 }, { "epoch": 0.08200748101747811, "grad_norm": 0.357421875, "grad_norm_var": 0.0008645216623942057, "learning_rate": 0.0001, "loss": 1.3933, "loss/crossentropy": 2.6822906732559204, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17059531062841415, "step": 5492 }, { "epoch": 0.08202241318809309, "grad_norm": 0.388671875, "grad_norm_var": 0.0008839766184488932, "learning_rate": 0.0001, "loss": 1.4758, "loss/crossentropy": 2.540347099304199, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.18669991195201874, "step": 5493 }, { "epoch": 0.08203734535870807, "grad_norm": 0.40234375, "grad_norm_var": 0.0009577274322509766, "learning_rate": 0.0001, "loss": 1.5508, "loss/crossentropy": 2.81610643863678, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.21098614484071732, "step": 5494 }, { "epoch": 0.08205227752932305, "grad_norm": 0.345703125, "grad_norm_var": 0.0009871800740559897, "learning_rate": 0.0001, "loss": 1.4726, "loss/crossentropy": 2.6108005046844482, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.19133253395557404, "step": 5495 }, { "epoch": 0.08206720969993803, "grad_norm": 0.365234375, "grad_norm_var": 0.0009642918904622396, "learning_rate": 0.0001, "loss": 1.4981, "loss/crossentropy": 2.68694007396698, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.19731876999139786, "step": 5496 }, { "epoch": 0.08208214187055302, "grad_norm": 0.373046875, "grad_norm_var": 0.0009629408518473308, "learning_rate": 0.0001, "loss": 1.5508, "loss/crossentropy": 2.2962825298309326, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.20709238946437836, "step": 5497 }, { "epoch": 0.08209707404116799, "grad_norm": 0.4609375, "grad_norm_var": 0.001453081766764323, "learning_rate": 0.0001, "loss": 1.649, "loss/crossentropy": 2.4114400148391724, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.19192852079868317, "step": 5498 }, { "epoch": 0.08211200621178298, "grad_norm": 0.357421875, "grad_norm_var": 0.0013976414998372396, "learning_rate": 0.0001, "loss": 1.4628, "loss/crossentropy": 2.9759472608566284, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.1971515715122223, "step": 5499 }, { "epoch": 0.08212693838239796, "grad_norm": 0.361328125, "grad_norm_var": 0.0013976414998372396, "learning_rate": 0.0001, "loss": 1.418, "loss/crossentropy": 2.4901498556137085, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.17580923438072205, "step": 5500 }, { "epoch": 0.08214187055301293, "grad_norm": 0.369140625, "grad_norm_var": 0.001240984598795573, "learning_rate": 0.0001, "loss": 1.5376, "loss/crossentropy": 2.6372565031051636, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.20944207906723022, "step": 5501 }, { "epoch": 0.08215680272362792, "grad_norm": 0.40234375, "grad_norm_var": 0.0009629408518473308, "learning_rate": 0.0001, "loss": 1.6506, "loss/crossentropy": 2.7253841161727905, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.22875021398067474, "step": 5502 }, { "epoch": 0.0821717348942429, "grad_norm": 0.89453125, "grad_norm_var": 0.017613665262858073, "learning_rate": 0.0001, "loss": 2.0821, "loss/crossentropy": 2.6855746507644653, "loss/fcd": 1.65625, "loss/idx": 10.0, "loss/logits": 0.425832137465477, "step": 5503 }, { "epoch": 0.08218666706485789, "grad_norm": 0.34765625, "grad_norm_var": 0.017724037170410156, "learning_rate": 0.0001, "loss": 1.5715, "loss/crossentropy": 2.6467961072921753, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.22778689861297607, "step": 5504 }, { "epoch": 0.08220159923547286, "grad_norm": 0.380859375, "grad_norm_var": 0.017747942606608072, "learning_rate": 0.0001, "loss": 1.4037, "loss/crossentropy": 2.7022976875305176, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.1927802786231041, "step": 5505 }, { "epoch": 0.08221653140608784, "grad_norm": 0.380859375, "grad_norm_var": 0.017733001708984376, "learning_rate": 0.0001, "loss": 1.5898, "loss/crossentropy": 2.5722124576568604, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.23435188084840775, "step": 5506 }, { "epoch": 0.08223146357670283, "grad_norm": 0.361328125, "grad_norm_var": 0.017530298233032225, "learning_rate": 0.0001, "loss": 1.5187, "loss/crossentropy": 2.628422737121582, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.19836919009685516, "step": 5507 }, { "epoch": 0.0822463957473178, "grad_norm": 1.4296875, "grad_norm_var": 0.08197269439697266, "learning_rate": 0.0001, "loss": 2.1074, "loss/crossentropy": 2.539982318878174, "loss/fcd": 1.7578125, "loss/idx": 10.0, "loss/logits": 0.3495393693447113, "step": 5508 }, { "epoch": 0.0822613279179328, "grad_norm": 0.39453125, "grad_norm_var": 0.0819063663482666, "learning_rate": 0.0001, "loss": 1.5824, "loss/crossentropy": 2.3908376693725586, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.20354107022285461, "step": 5509 }, { "epoch": 0.08227626008854777, "grad_norm": 0.42578125, "grad_norm_var": 0.08170838356018066, "learning_rate": 0.0001, "loss": 1.4933, "loss/crossentropy": 2.6010348796844482, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.196406289935112, "step": 5510 }, { "epoch": 0.08229119225916276, "grad_norm": 0.35546875, "grad_norm_var": 0.08154188791910807, "learning_rate": 0.0001, "loss": 1.4283, "loss/crossentropy": 2.452567219734192, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18606461584568024, "step": 5511 }, { "epoch": 0.08230612442977774, "grad_norm": 0.416015625, "grad_norm_var": 0.0809343973795573, "learning_rate": 0.0001, "loss": 1.4471, "loss/crossentropy": 2.349696636199951, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.173691026866436, "step": 5512 }, { "epoch": 0.08232105660039271, "grad_norm": 0.345703125, "grad_norm_var": 0.08137811024983724, "learning_rate": 0.0001, "loss": 1.4549, "loss/crossentropy": 2.6796014308929443, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.19316434860229492, "step": 5513 }, { "epoch": 0.0823359887710077, "grad_norm": 0.353515625, "grad_norm_var": 0.08237557411193848, "learning_rate": 0.0001, "loss": 1.4135, "loss/crossentropy": 2.485528588294983, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.16349846497178078, "step": 5514 }, { "epoch": 0.08235092094162268, "grad_norm": 0.38671875, "grad_norm_var": 0.08197574615478516, "learning_rate": 0.0001, "loss": 1.4666, "loss/crossentropy": 2.663368821144104, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.1892092302441597, "step": 5515 }, { "epoch": 0.08236585311223767, "grad_norm": 0.37890625, "grad_norm_var": 0.08172783851623536, "learning_rate": 0.0001, "loss": 1.4912, "loss/crossentropy": 2.651310920715332, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.20217514783143997, "step": 5516 }, { "epoch": 0.08238078528285264, "grad_norm": 1.28125, "grad_norm_var": 0.12067508697509766, "learning_rate": 0.0001, "loss": 1.8469, "loss/crossentropy": 2.2066505551338196, "loss/fcd": 1.5625, "loss/idx": 10.0, "loss/logits": 0.2844357416033745, "step": 5517 }, { "epoch": 0.08239571745346762, "grad_norm": 0.353515625, "grad_norm_var": 0.12167763710021973, "learning_rate": 0.0001, "loss": 1.437, "loss/crossentropy": 2.5672073364257812, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.17133712023496628, "step": 5518 }, { "epoch": 0.08241064962408261, "grad_norm": 0.412109375, "grad_norm_var": 0.11280104319254557, "learning_rate": 0.0001, "loss": 1.3298, "loss/crossentropy": 2.554343819618225, "loss/fcd": 1.16796875, "loss/idx": 10.0, "loss/logits": 0.16187676787376404, "step": 5519 }, { "epoch": 0.08242558179469758, "grad_norm": 0.34765625, "grad_norm_var": 0.11280104319254557, "learning_rate": 0.0001, "loss": 1.4723, "loss/crossentropy": 2.5940476655960083, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19883617758750916, "step": 5520 }, { "epoch": 0.08244051396531257, "grad_norm": 0.349609375, "grad_norm_var": 0.11335951487223307, "learning_rate": 0.0001, "loss": 1.3989, "loss/crossentropy": 2.5459851026535034, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17229372262954712, "step": 5521 }, { "epoch": 0.08245544613592755, "grad_norm": 0.5234375, "grad_norm_var": 0.11239762306213379, "learning_rate": 0.0001, "loss": 1.5809, "loss/crossentropy": 2.845401644706726, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.2058592140674591, "step": 5522 }, { "epoch": 0.08247037830654252, "grad_norm": 0.392578125, "grad_norm_var": 0.11185084978739421, "learning_rate": 0.0001, "loss": 1.5033, "loss/crossentropy": 2.6565407514572144, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.19084589183330536, "step": 5523 }, { "epoch": 0.08248531047715751, "grad_norm": 0.37109375, "grad_norm_var": 0.051960357030232746, "learning_rate": 0.0001, "loss": 1.5554, "loss/crossentropy": 2.5198848247528076, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.20774662494659424, "step": 5524 }, { "epoch": 0.08250024264777249, "grad_norm": 0.474609375, "grad_norm_var": 0.05184370676676432, "learning_rate": 0.0001, "loss": 1.5995, "loss/crossentropy": 2.565003514289856, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.20886456966400146, "step": 5525 }, { "epoch": 0.08251517481838748, "grad_norm": 0.40625, "grad_norm_var": 0.051925404866536455, "learning_rate": 0.0001, "loss": 1.4215, "loss/crossentropy": 2.6425576210021973, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18324033170938492, "step": 5526 }, { "epoch": 0.08253010698900246, "grad_norm": 0.427734375, "grad_norm_var": 0.05137200355529785, "learning_rate": 0.0001, "loss": 1.5344, "loss/crossentropy": 2.7804064750671387, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.19453420490026474, "step": 5527 }, { "epoch": 0.08254503915961743, "grad_norm": 0.44140625, "grad_norm_var": 0.05129286448160807, "learning_rate": 0.0001, "loss": 1.5386, "loss/crossentropy": 2.759593963623047, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20268774777650833, "step": 5528 }, { "epoch": 0.08255997133023242, "grad_norm": 0.41796875, "grad_norm_var": 0.05058655738830566, "learning_rate": 0.0001, "loss": 1.355, "loss/crossentropy": 2.5857701301574707, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.15973427891731262, "step": 5529 }, { "epoch": 0.0825749035008474, "grad_norm": 0.349609375, "grad_norm_var": 0.05064161618550619, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.4884949922561646, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1682010143995285, "step": 5530 }, { "epoch": 0.08258983567146239, "grad_norm": 0.333984375, "grad_norm_var": 0.05131066640218099, "learning_rate": 0.0001, "loss": 1.5281, "loss/crossentropy": 2.7343311309814453, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.2117360234260559, "step": 5531 }, { "epoch": 0.08260476784207736, "grad_norm": 0.3125, "grad_norm_var": 0.052249908447265625, "learning_rate": 0.0001, "loss": 1.3395, "loss/crossentropy": 2.6127774715423584, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.15981824696063995, "step": 5532 }, { "epoch": 0.08261970001269235, "grad_norm": 0.3828125, "grad_norm_var": 0.0030873616536458335, "learning_rate": 0.0001, "loss": 1.543, "loss/crossentropy": 2.520699381828308, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.2070363461971283, "step": 5533 }, { "epoch": 0.08263463218330733, "grad_norm": 0.333984375, "grad_norm_var": 0.0032154719034830728, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 2.800151228904724, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.19196010380983353, "step": 5534 }, { "epoch": 0.0826495643539223, "grad_norm": 0.3671875, "grad_norm_var": 0.0032231489817301433, "learning_rate": 0.0001, "loss": 1.5518, "loss/crossentropy": 2.492287278175354, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.2198084071278572, "step": 5535 }, { "epoch": 0.08266449652453729, "grad_norm": 0.62890625, "grad_norm_var": 0.006596867243448893, "learning_rate": 0.0001, "loss": 1.4581, "loss/crossentropy": 2.6100443601608276, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19246964156627655, "step": 5536 }, { "epoch": 0.08267942869515227, "grad_norm": 0.328125, "grad_norm_var": 0.0067904154459635414, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.5919820070266724, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18428125977516174, "step": 5537 }, { "epoch": 0.08269436086576726, "grad_norm": 0.3828125, "grad_norm_var": 0.005819956461588542, "learning_rate": 0.0001, "loss": 1.5759, "loss/crossentropy": 2.7604089975357056, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.23215198516845703, "step": 5538 }, { "epoch": 0.08270929303638223, "grad_norm": 0.40625, "grad_norm_var": 0.0058236281077067055, "learning_rate": 0.0001, "loss": 1.7355, "loss/crossentropy": 2.4519771337509155, "loss/fcd": 1.453125, "loss/idx": 10.0, "loss/logits": 0.28232693672180176, "step": 5539 }, { "epoch": 0.08272422520699721, "grad_norm": 0.35546875, "grad_norm_var": 0.005894581476847331, "learning_rate": 0.0001, "loss": 1.5105, "loss/crossentropy": 2.554636001586914, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.2018633559346199, "step": 5540 }, { "epoch": 0.0827391573776122, "grad_norm": 0.408203125, "grad_norm_var": 0.005481704076131185, "learning_rate": 0.0001, "loss": 1.4773, "loss/crossentropy": 2.548604369163513, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.21554343402385712, "step": 5541 }, { "epoch": 0.08275408954822717, "grad_norm": 0.56640625, "grad_norm_var": 0.007374175389607747, "learning_rate": 0.0001, "loss": 1.8168, "loss/crossentropy": 2.233923554420471, "loss/fcd": 1.58203125, "loss/idx": 10.0, "loss/logits": 0.23478785157203674, "step": 5542 }, { "epoch": 0.08276902171884216, "grad_norm": 0.3828125, "grad_norm_var": 0.007350413004557291, "learning_rate": 0.0001, "loss": 1.7362, "loss/crossentropy": 2.5675052404403687, "loss/fcd": 1.484375, "loss/idx": 10.0, "loss/logits": 0.25182371586561203, "step": 5543 }, { "epoch": 0.08278395388945714, "grad_norm": 0.359375, "grad_norm_var": 0.0073170344034830725, "learning_rate": 0.0001, "loss": 1.4201, "loss/crossentropy": 2.7832783460617065, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.17787662148475647, "step": 5544 }, { "epoch": 0.08279888606007212, "grad_norm": 0.353515625, "grad_norm_var": 0.007377354303995768, "learning_rate": 0.0001, "loss": 1.5239, "loss/crossentropy": 2.3495850563049316, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.18405850231647491, "step": 5545 }, { "epoch": 0.0828138182306871, "grad_norm": 0.408203125, "grad_norm_var": 0.007270542780558268, "learning_rate": 0.0001, "loss": 1.6096, "loss/crossentropy": 2.8673548698425293, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.23069600760936737, "step": 5546 }, { "epoch": 0.08282875040130208, "grad_norm": 0.373046875, "grad_norm_var": 0.00705119768778483, "learning_rate": 0.0001, "loss": 1.4758, "loss/crossentropy": 2.4172942638397217, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.17889203131198883, "step": 5547 }, { "epoch": 0.08284368257191707, "grad_norm": 0.365234375, "grad_norm_var": 0.006631914774576823, "learning_rate": 0.0001, "loss": 1.5127, "loss/crossentropy": 2.5299898386001587, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.20802882313728333, "step": 5548 }, { "epoch": 0.08285861474253205, "grad_norm": 0.3984375, "grad_norm_var": 0.006611061096191406, "learning_rate": 0.0001, "loss": 1.5124, "loss/crossentropy": 2.5398589372634888, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.19212783128023148, "step": 5549 }, { "epoch": 0.08287354691314704, "grad_norm": 0.41796875, "grad_norm_var": 0.006300083796183268, "learning_rate": 0.0001, "loss": 1.4768, "loss/crossentropy": 2.4064908027648926, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.1877315267920494, "step": 5550 }, { "epoch": 0.08288847908376201, "grad_norm": 0.93359375, "grad_norm_var": 0.02339183489481608, "learning_rate": 0.0001, "loss": 1.8618, "loss/crossentropy": 2.30444997549057, "loss/fcd": 1.66015625, "loss/idx": 10.0, "loss/logits": 0.20160426944494247, "step": 5551 }, { "epoch": 0.08290341125437699, "grad_norm": 0.341796875, "grad_norm_var": 0.021380106608072918, "learning_rate": 0.0001, "loss": 1.4401, "loss/crossentropy": 2.565819501876831, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.17841650545597076, "step": 5552 }, { "epoch": 0.08291834342499198, "grad_norm": 0.400390625, "grad_norm_var": 0.020784362157185873, "learning_rate": 0.0001, "loss": 1.415, "loss/crossentropy": 2.680217742919922, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.17284070700407028, "step": 5553 }, { "epoch": 0.08293327559560695, "grad_norm": 0.39453125, "grad_norm_var": 0.020721801122029624, "learning_rate": 0.0001, "loss": 1.5304, "loss/crossentropy": 2.7567001581192017, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.18270929157733917, "step": 5554 }, { "epoch": 0.08294820776622194, "grad_norm": 0.3984375, "grad_norm_var": 0.020749394098917642, "learning_rate": 0.0001, "loss": 1.7003, "loss/crossentropy": 2.4872461557388306, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.24324534088373184, "step": 5555 }, { "epoch": 0.08296313993683692, "grad_norm": 0.384765625, "grad_norm_var": 0.020517412821451822, "learning_rate": 0.0001, "loss": 1.7252, "loss/crossentropy": 2.62427294254303, "loss/fcd": 1.453125, "loss/idx": 10.0, "loss/logits": 0.27207469940185547, "step": 5556 }, { "epoch": 0.0829780721074519, "grad_norm": 0.34765625, "grad_norm_var": 0.02092588742574056, "learning_rate": 0.0001, "loss": 1.5719, "loss/crossentropy": 2.562120795249939, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.23207709193229675, "step": 5557 }, { "epoch": 0.08299300427806688, "grad_norm": 0.38671875, "grad_norm_var": 0.01959519386291504, "learning_rate": 0.0001, "loss": 1.6826, "loss/crossentropy": 2.682452082633972, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.2490188479423523, "step": 5558 }, { "epoch": 0.08300793644868186, "grad_norm": 0.427734375, "grad_norm_var": 0.019526100158691405, "learning_rate": 0.0001, "loss": 1.6796, "loss/crossentropy": 2.436118245124817, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.2421497404575348, "step": 5559 }, { "epoch": 0.08302286861929685, "grad_norm": 0.369140625, "grad_norm_var": 0.019455448786417643, "learning_rate": 0.0001, "loss": 1.5407, "loss/crossentropy": 2.773695945739746, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.2047533318400383, "step": 5560 }, { "epoch": 0.08303780078991183, "grad_norm": 0.408203125, "grad_norm_var": 0.01916616757710775, "learning_rate": 0.0001, "loss": 1.6145, "loss/crossentropy": 2.5575441122055054, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.21999594569206238, "step": 5561 }, { "epoch": 0.0830527329605268, "grad_norm": 0.3515625, "grad_norm_var": 0.019472694396972655, "learning_rate": 0.0001, "loss": 1.5614, "loss/crossentropy": 2.4566060304641724, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.21767067909240723, "step": 5562 }, { "epoch": 0.08306766513114179, "grad_norm": 0.3515625, "grad_norm_var": 0.019632323582967123, "learning_rate": 0.0001, "loss": 1.4888, "loss/crossentropy": 2.612049698829651, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.203671433031559, "step": 5563 }, { "epoch": 0.08308259730175677, "grad_norm": 0.365234375, "grad_norm_var": 0.019632323582967123, "learning_rate": 0.0001, "loss": 1.5227, "loss/crossentropy": 2.5551841259002686, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.20631542801856995, "step": 5564 }, { "epoch": 0.08309752947237176, "grad_norm": 0.3359375, "grad_norm_var": 0.020034138361612955, "learning_rate": 0.0001, "loss": 1.4415, "loss/crossentropy": 2.5987898111343384, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.17982706427574158, "step": 5565 }, { "epoch": 0.08311246164298673, "grad_norm": 0.408203125, "grad_norm_var": 0.020034217834472658, "learning_rate": 0.0001, "loss": 1.4547, "loss/crossentropy": 2.7412527799606323, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.1812990978360176, "step": 5566 }, { "epoch": 0.08312739381360171, "grad_norm": 0.37890625, "grad_norm_var": 0.0007501602172851563, "learning_rate": 0.0001, "loss": 1.5979, "loss/crossentropy": 2.5513503551483154, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.2424437552690506, "step": 5567 }, { "epoch": 0.0831423259842167, "grad_norm": 0.357421875, "grad_norm_var": 0.0006896336873372396, "learning_rate": 0.0001, "loss": 1.5089, "loss/crossentropy": 2.62576687335968, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.18853989988565445, "step": 5568 }, { "epoch": 0.08315725815483167, "grad_norm": 0.365234375, "grad_norm_var": 0.0006673177083333333, "learning_rate": 0.0001, "loss": 1.6629, "loss/crossentropy": 2.2360164523124695, "loss/fcd": 1.4453125, "loss/idx": 10.0, "loss/logits": 0.2176261767745018, "step": 5569 }, { "epoch": 0.08317219032544666, "grad_norm": 0.64453125, "grad_norm_var": 0.005159505208333333, "learning_rate": 0.0001, "loss": 1.4999, "loss/crossentropy": 2.495660662651062, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.19128231704235077, "step": 5570 }, { "epoch": 0.08318712249606164, "grad_norm": 0.45703125, "grad_norm_var": 0.0054198582967122395, "learning_rate": 0.0001, "loss": 1.7443, "loss/crossentropy": 2.4266387224197388, "loss/fcd": 1.51171875, "loss/idx": 10.0, "loss/logits": 0.2326010912656784, "step": 5571 }, { "epoch": 0.08320205466667663, "grad_norm": 0.35546875, "grad_norm_var": 0.005518325169881185, "learning_rate": 0.0001, "loss": 1.5385, "loss/crossentropy": 2.5038766860961914, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.2142900675535202, "step": 5572 }, { "epoch": 0.0832169868372916, "grad_norm": 0.392578125, "grad_norm_var": 0.005364418029785156, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.8409790992736816, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19053920358419418, "step": 5573 }, { "epoch": 0.08323191900790658, "grad_norm": 0.40625, "grad_norm_var": 0.005360921223958333, "learning_rate": 0.0001, "loss": 1.6633, "loss/crossentropy": 2.529631495475769, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.24142106622457504, "step": 5574 }, { "epoch": 0.08324685117852157, "grad_norm": 0.376953125, "grad_norm_var": 0.005323727925618489, "learning_rate": 0.0001, "loss": 1.4039, "loss/crossentropy": 2.696580648422241, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.18512921035289764, "step": 5575 }, { "epoch": 0.08326178334913654, "grad_norm": 0.33203125, "grad_norm_var": 0.005539051691691081, "learning_rate": 0.0001, "loss": 1.5311, "loss/crossentropy": 2.617701530456543, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.20683394372463226, "step": 5576 }, { "epoch": 0.08327671551975153, "grad_norm": 0.376953125, "grad_norm_var": 0.005536508560180664, "learning_rate": 0.0001, "loss": 1.43, "loss/crossentropy": 2.3734161853790283, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.1799619495868683, "step": 5577 }, { "epoch": 0.08329164769036651, "grad_norm": 2.328125, "grad_norm_var": 0.23932035764058432, "learning_rate": 0.0001, "loss": 1.9381, "loss/crossentropy": 2.7846760749816895, "loss/fcd": 1.5546875, "loss/idx": 10.0, "loss/logits": 0.38344521075487137, "step": 5578 }, { "epoch": 0.08330657986098149, "grad_norm": 0.40625, "grad_norm_var": 0.23831899960835776, "learning_rate": 0.0001, "loss": 1.6255, "loss/crossentropy": 2.4422723054885864, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.22708148509263992, "step": 5579 }, { "epoch": 0.08332151203159648, "grad_norm": 0.375, "grad_norm_var": 0.2381261189778646, "learning_rate": 0.0001, "loss": 1.3589, "loss/crossentropy": 2.6065824031829834, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.16356902569532394, "step": 5580 }, { "epoch": 0.08333644420221145, "grad_norm": 0.384765625, "grad_norm_var": 0.23708621660868326, "learning_rate": 0.0001, "loss": 1.5667, "loss/crossentropy": 2.606644034385681, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.21906017512083054, "step": 5581 }, { "epoch": 0.08335137637282644, "grad_norm": 0.365234375, "grad_norm_var": 0.2378513177235921, "learning_rate": 0.0001, "loss": 1.4902, "loss/crossentropy": 2.5923006534576416, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.19720075279474258, "step": 5582 }, { "epoch": 0.08336630854344142, "grad_norm": 0.408203125, "grad_norm_var": 0.23735802968343098, "learning_rate": 0.0001, "loss": 1.6297, "loss/crossentropy": 2.4227694272994995, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.21951452642679214, "step": 5583 }, { "epoch": 0.08338124071405639, "grad_norm": 0.40234375, "grad_norm_var": 0.23650587399800618, "learning_rate": 0.0001, "loss": 1.7115, "loss/crossentropy": 2.462417960166931, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.27403856813907623, "step": 5584 }, { "epoch": 0.08339617288467138, "grad_norm": 0.369140625, "grad_norm_var": 0.23642436663309732, "learning_rate": 0.0001, "loss": 1.4789, "loss/crossentropy": 2.596983551979065, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.19767680764198303, "step": 5585 }, { "epoch": 0.08341110505528636, "grad_norm": 0.50390625, "grad_norm_var": 0.23539668718973797, "learning_rate": 0.0001, "loss": 1.5203, "loss/crossentropy": 2.804142475128174, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.196048803627491, "step": 5586 }, { "epoch": 0.08342603722590135, "grad_norm": 0.3828125, "grad_norm_var": 0.23631475766499838, "learning_rate": 0.0001, "loss": 1.547, "loss/crossentropy": 2.4497811794281006, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.21110451966524124, "step": 5587 }, { "epoch": 0.08344096939651632, "grad_norm": 0.400390625, "grad_norm_var": 0.23551305135091147, "learning_rate": 0.0001, "loss": 1.4602, "loss/crossentropy": 2.623879909515381, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.18676754087209702, "step": 5588 }, { "epoch": 0.0834559015671313, "grad_norm": 0.400390625, "grad_norm_var": 0.2353912353515625, "learning_rate": 0.0001, "loss": 1.5792, "loss/crossentropy": 2.6907877922058105, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.21986443549394608, "step": 5589 }, { "epoch": 0.08347083373774629, "grad_norm": 0.46484375, "grad_norm_var": 0.2347665786743164, "learning_rate": 0.0001, "loss": 1.7578, "loss/crossentropy": 2.6984134912490845, "loss/fcd": 1.48828125, "loss/idx": 10.0, "loss/logits": 0.26947182416915894, "step": 5590 }, { "epoch": 0.08348576590836126, "grad_norm": 0.40625, "grad_norm_var": 0.23427186012268067, "learning_rate": 0.0001, "loss": 1.5154, "loss/crossentropy": 2.734674334526062, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.18723345547914505, "step": 5591 }, { "epoch": 0.08350069807897625, "grad_norm": 0.43359375, "grad_norm_var": 0.2323824405670166, "learning_rate": 0.0001, "loss": 1.5592, "loss/crossentropy": 2.4464415311813354, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.207642063498497, "step": 5592 }, { "epoch": 0.08351563024959123, "grad_norm": 0.37890625, "grad_norm_var": 0.2323439915974935, "learning_rate": 0.0001, "loss": 1.6087, "loss/crossentropy": 2.6454904079437256, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.22584786266088486, "step": 5593 }, { "epoch": 0.08353056242020622, "grad_norm": 0.34765625, "grad_norm_var": 0.0015149434407552083, "learning_rate": 0.0001, "loss": 1.5608, "loss/crossentropy": 2.841018557548523, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.22486796975135803, "step": 5594 }, { "epoch": 0.0835454945908212, "grad_norm": 0.353515625, "grad_norm_var": 0.001657851537068685, "learning_rate": 0.0001, "loss": 1.364, "loss/crossentropy": 2.536271333694458, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.17650936543941498, "step": 5595 }, { "epoch": 0.08356042676143617, "grad_norm": 0.373046875, "grad_norm_var": 0.0016642252604166666, "learning_rate": 0.0001, "loss": 1.4882, "loss/crossentropy": 2.377719521522522, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.18739980459213257, "step": 5596 }, { "epoch": 0.08357535893205116, "grad_norm": 0.3203125, "grad_norm_var": 0.0020413557688395184, "learning_rate": 0.0001, "loss": 1.4099, "loss/crossentropy": 2.4728150367736816, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1794436275959015, "step": 5597 }, { "epoch": 0.08359029110266614, "grad_norm": 0.33203125, "grad_norm_var": 0.002239418029785156, "learning_rate": 0.0001, "loss": 1.4589, "loss/crossentropy": 2.5218008756637573, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19322840124368668, "step": 5598 }, { "epoch": 0.08360522327328113, "grad_norm": 0.341796875, "grad_norm_var": 0.0023745218912760418, "learning_rate": 0.0001, "loss": 1.4653, "loss/crossentropy": 2.624788284301758, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1840207576751709, "step": 5599 }, { "epoch": 0.0836201554438961, "grad_norm": 0.45703125, "grad_norm_var": 0.0026646931966145832, "learning_rate": 0.0001, "loss": 1.482, "loss/crossentropy": 2.6420488357543945, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.18119549751281738, "step": 5600 }, { "epoch": 0.08363508761451108, "grad_norm": 0.376953125, "grad_norm_var": 0.002645111083984375, "learning_rate": 0.0001, "loss": 1.5042, "loss/crossentropy": 2.70492684841156, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.20340228080749512, "step": 5601 }, { "epoch": 0.08365001978512607, "grad_norm": 0.419921875, "grad_norm_var": 0.0018338362375895183, "learning_rate": 0.0001, "loss": 1.5093, "loss/crossentropy": 2.3815414905548096, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.2006739303469658, "step": 5602 }, { "epoch": 0.08366495195574104, "grad_norm": 0.419921875, "grad_norm_var": 0.0018999735514322917, "learning_rate": 0.0001, "loss": 1.3751, "loss/crossentropy": 2.7681710720062256, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.1641220971941948, "step": 5603 }, { "epoch": 0.08367988412635603, "grad_norm": 0.419921875, "grad_norm_var": 0.0019530614217122396, "learning_rate": 0.0001, "loss": 1.6824, "loss/crossentropy": 2.5578426122665405, "loss/fcd": 1.44921875, "loss/idx": 10.0, "loss/logits": 0.23314543068408966, "step": 5604 }, { "epoch": 0.08369481629697101, "grad_norm": 0.326171875, "grad_norm_var": 0.0021982828776041666, "learning_rate": 0.0001, "loss": 1.4673, "loss/crossentropy": 2.38584566116333, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.18606092780828476, "step": 5605 }, { "epoch": 0.08370974846758598, "grad_norm": 0.98828125, "grad_norm_var": 0.02484308878580729, "learning_rate": 0.0001, "loss": 1.889, "loss/crossentropy": 2.932942509651184, "loss/fcd": 1.53515625, "loss/idx": 10.0, "loss/logits": 0.35386358201503754, "step": 5606 }, { "epoch": 0.08372468063820097, "grad_norm": 0.41015625, "grad_norm_var": 0.024837684631347657, "learning_rate": 0.0001, "loss": 1.5389, "loss/crossentropy": 2.6533130407333374, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20299965143203735, "step": 5607 }, { "epoch": 0.08373961280881595, "grad_norm": 0.341796875, "grad_norm_var": 0.025182072321573892, "learning_rate": 0.0001, "loss": 1.3212, "loss/crossentropy": 2.5547986030578613, "loss/fcd": 1.16796875, "loss/idx": 10.0, "loss/logits": 0.15319480001926422, "step": 5608 }, { "epoch": 0.08375454497943094, "grad_norm": 0.353515625, "grad_norm_var": 0.025337664286295573, "learning_rate": 0.0001, "loss": 1.4193, "loss/crossentropy": 2.5853570699691772, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.1849393993616104, "step": 5609 }, { "epoch": 0.08376947715004592, "grad_norm": 0.35546875, "grad_norm_var": 0.02527510325113932, "learning_rate": 0.0001, "loss": 1.3919, "loss/crossentropy": 2.5622044801712036, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.17704448103904724, "step": 5610 }, { "epoch": 0.0837844093206609, "grad_norm": 0.337890625, "grad_norm_var": 0.02541192372639974, "learning_rate": 0.0001, "loss": 1.5014, "loss/crossentropy": 2.4192891120910645, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19671819359064102, "step": 5611 }, { "epoch": 0.08379934149127588, "grad_norm": 0.34375, "grad_norm_var": 0.02561338742574056, "learning_rate": 0.0001, "loss": 1.4523, "loss/crossentropy": 2.5167545080184937, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19451014697551727, "step": 5612 }, { "epoch": 0.08381427366189086, "grad_norm": 0.515625, "grad_norm_var": 0.02568650245666504, "learning_rate": 0.0001, "loss": 1.6429, "loss/crossentropy": 2.507842540740967, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.2093006819486618, "step": 5613 }, { "epoch": 0.08382920583250585, "grad_norm": 0.408203125, "grad_norm_var": 0.025142860412597657, "learning_rate": 0.0001, "loss": 1.4501, "loss/crossentropy": 2.688083291053772, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.18061483651399612, "step": 5614 }, { "epoch": 0.08384413800312082, "grad_norm": 0.375, "grad_norm_var": 0.024838876724243165, "learning_rate": 0.0001, "loss": 1.6176, "loss/crossentropy": 2.521301031112671, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.22694971412420273, "step": 5615 }, { "epoch": 0.08385907017373581, "grad_norm": 0.361328125, "grad_norm_var": 0.025042152404785155, "learning_rate": 0.0001, "loss": 1.4336, "loss/crossentropy": 2.653631329536438, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.17582793533802032, "step": 5616 }, { "epoch": 0.08387400234435079, "grad_norm": 0.392578125, "grad_norm_var": 0.02496331532796224, "learning_rate": 0.0001, "loss": 1.6293, "loss/crossentropy": 2.650401473045349, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.2347649335861206, "step": 5617 }, { "epoch": 0.08388893451496576, "grad_norm": 0.37109375, "grad_norm_var": 0.02513298988342285, "learning_rate": 0.0001, "loss": 1.454, "loss/crossentropy": 2.388360023498535, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18836647272109985, "step": 5618 }, { "epoch": 0.08390386668558075, "grad_norm": 0.400390625, "grad_norm_var": 0.02515714963277181, "learning_rate": 0.0001, "loss": 1.5365, "loss/crossentropy": 2.6296521425247192, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20057462900877, "step": 5619 }, { "epoch": 0.08391879885619573, "grad_norm": 0.318359375, "grad_norm_var": 0.025786956151326496, "learning_rate": 0.0001, "loss": 1.3423, "loss/crossentropy": 2.5744388103485107, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.1665259152650833, "step": 5620 }, { "epoch": 0.08393373102681072, "grad_norm": 0.359375, "grad_norm_var": 0.025473785400390626, "learning_rate": 0.0001, "loss": 1.5629, "loss/crossentropy": 2.579394578933716, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.234799824655056, "step": 5621 }, { "epoch": 0.0839486631974257, "grad_norm": 0.4453125, "grad_norm_var": 0.002364031473795573, "learning_rate": 0.0001, "loss": 1.6657, "loss/crossentropy": 2.6433463096618652, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.2398870661854744, "step": 5622 }, { "epoch": 0.08396359536804067, "grad_norm": 0.376953125, "grad_norm_var": 0.002302153905232747, "learning_rate": 0.0001, "loss": 1.5709, "loss/crossentropy": 2.768515467643738, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.21931955218315125, "step": 5623 }, { "epoch": 0.08397852753865566, "grad_norm": 0.515625, "grad_norm_var": 0.003339068094889323, "learning_rate": 0.0001, "loss": 1.6757, "loss/crossentropy": 2.6639211177825928, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.2382015883922577, "step": 5624 }, { "epoch": 0.08399345970927063, "grad_norm": 0.58984375, "grad_norm_var": 0.005698887507120768, "learning_rate": 0.0001, "loss": 1.6313, "loss/crossentropy": 2.495992064476013, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.24850402772426605, "step": 5625 }, { "epoch": 0.08400839187988562, "grad_norm": 0.353515625, "grad_norm_var": 0.005711809794108073, "learning_rate": 0.0001, "loss": 1.482, "loss/crossentropy": 2.6774871349334717, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19294799119234085, "step": 5626 }, { "epoch": 0.0840233240505006, "grad_norm": 0.43359375, "grad_norm_var": 0.0054399967193603516, "learning_rate": 0.0001, "loss": 1.5896, "loss/crossentropy": 2.476211905479431, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.19894296675920486, "step": 5627 }, { "epoch": 0.08403825622111558, "grad_norm": 0.3671875, "grad_norm_var": 0.005267190933227539, "learning_rate": 0.0001, "loss": 1.4578, "loss/crossentropy": 2.8697367906570435, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19999537616968155, "step": 5628 }, { "epoch": 0.08405318839173057, "grad_norm": 0.37890625, "grad_norm_var": 0.004537312189737955, "learning_rate": 0.0001, "loss": 1.5157, "loss/crossentropy": 2.6211864948272705, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.19926973432302475, "step": 5629 }, { "epoch": 0.08406812056234554, "grad_norm": 0.357421875, "grad_norm_var": 0.004662942886352539, "learning_rate": 0.0001, "loss": 1.3888, "loss/crossentropy": 2.524079442024231, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.17009639739990234, "step": 5630 }, { "epoch": 0.08408305273296053, "grad_norm": 0.380859375, "grad_norm_var": 0.004645729064941406, "learning_rate": 0.0001, "loss": 1.5885, "loss/crossentropy": 2.4947909116744995, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.22912071645259857, "step": 5631 }, { "epoch": 0.0840979849035755, "grad_norm": 0.36328125, "grad_norm_var": 0.004635858535766602, "learning_rate": 0.0001, "loss": 1.4392, "loss/crossentropy": 2.68558931350708, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1814335733652115, "step": 5632 }, { "epoch": 0.0841129170741905, "grad_norm": 0.384765625, "grad_norm_var": 0.004647684097290039, "learning_rate": 0.0001, "loss": 1.5266, "loss/crossentropy": 2.4673837423324585, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.19460556656122208, "step": 5633 }, { "epoch": 0.08412784924480547, "grad_norm": 0.375, "grad_norm_var": 0.004633696873982748, "learning_rate": 0.0001, "loss": 1.5418, "loss/crossentropy": 2.6977070569992065, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.21755839884281158, "step": 5634 }, { "epoch": 0.08414278141542045, "grad_norm": 0.35546875, "grad_norm_var": 0.00475762685139974, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.781487226486206, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.1921514868736267, "step": 5635 }, { "epoch": 0.08415771358603544, "grad_norm": 0.5, "grad_norm_var": 0.004909880956013997, "learning_rate": 0.0001, "loss": 1.469, "loss/crossentropy": 2.8422375917434692, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.2073005586862564, "step": 5636 }, { "epoch": 0.08417264575665041, "grad_norm": 0.365234375, "grad_norm_var": 0.004873593648274739, "learning_rate": 0.0001, "loss": 1.5577, "loss/crossentropy": 2.5936710834503174, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.2139626368880272, "step": 5637 }, { "epoch": 0.0841875779272654, "grad_norm": 0.3828125, "grad_norm_var": 0.0048145929972330725, "learning_rate": 0.0001, "loss": 1.5215, "loss/crossentropy": 2.810150980949402, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.20513170212507248, "step": 5638 }, { "epoch": 0.08420251009788038, "grad_norm": 0.37109375, "grad_norm_var": 0.00483867327372233, "learning_rate": 0.0001, "loss": 1.4718, "loss/crossentropy": 2.6709606647491455, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.19057705998420715, "step": 5639 }, { "epoch": 0.08421744226849535, "grad_norm": 0.3828125, "grad_norm_var": 0.003976170221964518, "learning_rate": 0.0001, "loss": 1.572, "loss/crossentropy": 2.5146050453186035, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.19695378839969635, "step": 5640 }, { "epoch": 0.08423237443911034, "grad_norm": 0.369140625, "grad_norm_var": 0.0013269424438476563, "learning_rate": 0.0001, "loss": 1.5132, "loss/crossentropy": 2.6804720163345337, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.2084779143333435, "step": 5641 }, { "epoch": 0.08424730660972532, "grad_norm": 0.37890625, "grad_norm_var": 0.0012688795725504557, "learning_rate": 0.0001, "loss": 1.6116, "loss/crossentropy": 2.3544336557388306, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.20534543693065643, "step": 5642 }, { "epoch": 0.08426223878034031, "grad_norm": 0.416015625, "grad_norm_var": 0.001172320048014323, "learning_rate": 0.0001, "loss": 1.5451, "loss/crossentropy": 2.8009756803512573, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.205250583589077, "step": 5643 }, { "epoch": 0.08427717095095529, "grad_norm": 0.32421875, "grad_norm_var": 0.001378631591796875, "learning_rate": 0.0001, "loss": 1.548, "loss/crossentropy": 2.6542139053344727, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.2120852917432785, "step": 5644 }, { "epoch": 0.08429210312157026, "grad_norm": 0.380859375, "grad_norm_var": 0.001378488540649414, "learning_rate": 0.0001, "loss": 1.5698, "loss/crossentropy": 2.6291604042053223, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.2104644626379013, "step": 5645 }, { "epoch": 0.08430703529218525, "grad_norm": 0.33203125, "grad_norm_var": 0.00149688720703125, "learning_rate": 0.0001, "loss": 1.3646, "loss/crossentropy": 2.5103936195373535, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.16146105527877808, "step": 5646 }, { "epoch": 0.08432196746280023, "grad_norm": 0.40625, "grad_norm_var": 0.0015437920888264974, "learning_rate": 0.0001, "loss": 1.4867, "loss/crossentropy": 2.408492922782898, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.1898195743560791, "step": 5647 }, { "epoch": 0.08433689963341522, "grad_norm": 0.349609375, "grad_norm_var": 0.0015868504842122396, "learning_rate": 0.0001, "loss": 1.475, "loss/crossentropy": 2.8647974729537964, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.19373154640197754, "step": 5648 }, { "epoch": 0.08435183180403019, "grad_norm": 0.357421875, "grad_norm_var": 0.0016148885091145833, "learning_rate": 0.0001, "loss": 1.4467, "loss/crossentropy": 2.5774141550064087, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.20452135056257248, "step": 5649 }, { "epoch": 0.08436676397464517, "grad_norm": 0.337890625, "grad_norm_var": 0.0017154534657796224, "learning_rate": 0.0001, "loss": 1.4254, "loss/crossentropy": 2.5229716300964355, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18709959089756012, "step": 5650 }, { "epoch": 0.08438169614526016, "grad_norm": 0.341796875, "grad_norm_var": 0.0017638524373372396, "learning_rate": 0.0001, "loss": 1.6124, "loss/crossentropy": 2.6408777236938477, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.2490696683526039, "step": 5651 }, { "epoch": 0.08439662831587513, "grad_norm": 0.423828125, "grad_norm_var": 0.0008544762929280599, "learning_rate": 0.0001, "loss": 1.5046, "loss/crossentropy": 2.468769669532776, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.1843179166316986, "step": 5652 }, { "epoch": 0.08441156048649012, "grad_norm": 0.361328125, "grad_norm_var": 0.0008579095204671224, "learning_rate": 0.0001, "loss": 1.6419, "loss/crossentropy": 2.5182732343673706, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.2317866086959839, "step": 5653 }, { "epoch": 0.0844264926571051, "grad_norm": 0.328125, "grad_norm_var": 0.0009495894114176433, "learning_rate": 0.0001, "loss": 1.5606, "loss/crossentropy": 2.472420811653137, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.23251678049564362, "step": 5654 }, { "epoch": 0.08444142482772009, "grad_norm": 0.3125, "grad_norm_var": 0.0011269728342692058, "learning_rate": 0.0001, "loss": 1.3525, "loss/crossentropy": 2.608055591583252, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.17277668416500092, "step": 5655 }, { "epoch": 0.08445635699833506, "grad_norm": 0.361328125, "grad_norm_var": 0.0010981241861979167, "learning_rate": 0.0001, "loss": 1.5019, "loss/crossentropy": 2.4700111150741577, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19725699722766876, "step": 5656 }, { "epoch": 0.08447128916895004, "grad_norm": 0.388671875, "grad_norm_var": 0.0011423110961914062, "learning_rate": 0.0001, "loss": 1.5448, "loss/crossentropy": 2.5870054960250854, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20889145880937576, "step": 5657 }, { "epoch": 0.08448622133956503, "grad_norm": 0.40625, "grad_norm_var": 0.0012486775716145833, "learning_rate": 0.0001, "loss": 1.5874, "loss/crossentropy": 2.5593470335006714, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.20853295922279358, "step": 5658 }, { "epoch": 0.08450115351018, "grad_norm": 0.62890625, "grad_norm_var": 0.005550495783487956, "learning_rate": 0.0001, "loss": 1.7114, "loss/crossentropy": 2.3949259519577026, "loss/fcd": 1.46484375, "loss/idx": 10.0, "loss/logits": 0.24656803160905838, "step": 5659 }, { "epoch": 0.084516085680795, "grad_norm": 0.341796875, "grad_norm_var": 0.005444780985514323, "learning_rate": 0.0001, "loss": 1.5262, "loss/crossentropy": 2.437016487121582, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.1941913440823555, "step": 5660 }, { "epoch": 0.08453101785140997, "grad_norm": 0.3203125, "grad_norm_var": 0.0056561628977457685, "learning_rate": 0.0001, "loss": 1.3562, "loss/crossentropy": 2.260611653327942, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.15302611887454987, "step": 5661 }, { "epoch": 0.08454595002202495, "grad_norm": 0.33203125, "grad_norm_var": 0.0056561628977457685, "learning_rate": 0.0001, "loss": 1.4481, "loss/crossentropy": 2.616440773010254, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.19814449548721313, "step": 5662 }, { "epoch": 0.08456088219263994, "grad_norm": 0.37109375, "grad_norm_var": 0.00558635393778483, "learning_rate": 0.0001, "loss": 1.6437, "loss/crossentropy": 2.6307249069213867, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.22180438786745071, "step": 5663 }, { "epoch": 0.08457581436325491, "grad_norm": 0.35546875, "grad_norm_var": 0.005570475260416667, "learning_rate": 0.0001, "loss": 1.389, "loss/crossentropy": 2.7007678747177124, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.18193482607603073, "step": 5664 }, { "epoch": 0.0845907465338699, "grad_norm": 0.380859375, "grad_norm_var": 0.005555979410807292, "learning_rate": 0.0001, "loss": 1.5123, "loss/crossentropy": 2.462680697441101, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.18419751524925232, "step": 5665 }, { "epoch": 0.08460567870448488, "grad_norm": 0.341796875, "grad_norm_var": 0.005537859598795573, "learning_rate": 0.0001, "loss": 1.4752, "loss/crossentropy": 2.5630385875701904, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.20561926066875458, "step": 5666 }, { "epoch": 0.08462061087509985, "grad_norm": 0.345703125, "grad_norm_var": 0.005521647135416667, "learning_rate": 0.0001, "loss": 1.4287, "loss/crossentropy": 2.7204813957214355, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1826217770576477, "step": 5667 }, { "epoch": 0.08463554304571484, "grad_norm": 0.3515625, "grad_norm_var": 0.005377562840779623, "learning_rate": 0.0001, "loss": 1.512, "loss/crossentropy": 2.3252744674682617, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.21122957020998, "step": 5668 }, { "epoch": 0.08465047521632982, "grad_norm": 0.349609375, "grad_norm_var": 0.005400451024373373, "learning_rate": 0.0001, "loss": 1.5333, "loss/crossentropy": 2.2224299907684326, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.20131048560142517, "step": 5669 }, { "epoch": 0.08466540738694481, "grad_norm": 0.412109375, "grad_norm_var": 0.0053751627604166664, "learning_rate": 0.0001, "loss": 1.4195, "loss/crossentropy": 2.488065004348755, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1734003722667694, "step": 5670 }, { "epoch": 0.08468033955755978, "grad_norm": 0.41796875, "grad_norm_var": 0.005191485087076823, "learning_rate": 0.0001, "loss": 1.4841, "loss/crossentropy": 2.529186725616455, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.20284470170736313, "step": 5671 }, { "epoch": 0.08469527172817477, "grad_norm": 0.337890625, "grad_norm_var": 0.005289141337076823, "learning_rate": 0.0001, "loss": 1.5834, "loss/crossentropy": 2.5661723613739014, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.22016824036836624, "step": 5672 }, { "epoch": 0.08471020389878975, "grad_norm": 0.39453125, "grad_norm_var": 0.005297962824503581, "learning_rate": 0.0001, "loss": 1.5186, "loss/crossentropy": 2.6941224336624146, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.20218335092067719, "step": 5673 }, { "epoch": 0.08472513606940472, "grad_norm": 0.349609375, "grad_norm_var": 0.005303955078125, "learning_rate": 0.0001, "loss": 1.4225, "loss/crossentropy": 2.653908848762512, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1764025315642357, "step": 5674 }, { "epoch": 0.08474006824001971, "grad_norm": 0.390625, "grad_norm_var": 0.0008478164672851562, "learning_rate": 0.0001, "loss": 1.4569, "loss/crossentropy": 2.778231620788574, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.17954035103321075, "step": 5675 }, { "epoch": 0.08475500041063469, "grad_norm": 0.33984375, "grad_norm_var": 0.0008533318837483724, "learning_rate": 0.0001, "loss": 1.3937, "loss/crossentropy": 2.440348267555237, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.17881284654140472, "step": 5676 }, { "epoch": 0.08476993258124968, "grad_norm": 0.330078125, "grad_norm_var": 0.0008050918579101563, "learning_rate": 0.0001, "loss": 1.384, "loss/crossentropy": 2.593424677848816, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17307250946760178, "step": 5677 }, { "epoch": 0.08478486475186466, "grad_norm": 0.486328125, "grad_norm_var": 0.0016652266184488931, "learning_rate": 0.0001, "loss": 1.6423, "loss/crossentropy": 2.6103683710098267, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.2087191939353943, "step": 5678 }, { "epoch": 0.08479979692247963, "grad_norm": 0.376953125, "grad_norm_var": 0.0016665140787760417, "learning_rate": 0.0001, "loss": 1.5576, "loss/crossentropy": 2.5030394792556763, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.20998401939868927, "step": 5679 }, { "epoch": 0.08481472909309462, "grad_norm": 0.37109375, "grad_norm_var": 0.0016461690266927084, "learning_rate": 0.0001, "loss": 1.5683, "loss/crossentropy": 2.6373448371887207, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.2206905335187912, "step": 5680 }, { "epoch": 0.0848296612637096, "grad_norm": 0.453125, "grad_norm_var": 0.0020431359608968098, "learning_rate": 0.0001, "loss": 1.6249, "loss/crossentropy": 2.6398072242736816, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.19914818555116653, "step": 5681 }, { "epoch": 0.08484459343432459, "grad_norm": 0.4375, "grad_norm_var": 0.0021529515584309896, "learning_rate": 0.0001, "loss": 1.6011, "loss/crossentropy": 2.6239240169525146, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.2144039422273636, "step": 5682 }, { "epoch": 0.08485952560493956, "grad_norm": 0.40625, "grad_norm_var": 0.002072636286417643, "learning_rate": 0.0001, "loss": 1.6024, "loss/crossentropy": 2.4968961477279663, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.2195436954498291, "step": 5683 }, { "epoch": 0.08487445777555454, "grad_norm": 0.43359375, "grad_norm_var": 0.002096668879191081, "learning_rate": 0.0001, "loss": 1.4879, "loss/crossentropy": 2.579380989074707, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.18318799883127213, "step": 5684 }, { "epoch": 0.08488938994616953, "grad_norm": 0.412109375, "grad_norm_var": 0.001979684829711914, "learning_rate": 0.0001, "loss": 1.5498, "loss/crossentropy": 2.8848432302474976, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.21391094475984573, "step": 5685 }, { "epoch": 0.0849043221167845, "grad_norm": 0.96875, "grad_norm_var": 0.02247772216796875, "learning_rate": 0.0001, "loss": 2.1152, "loss/crossentropy": 2.6289011240005493, "loss/fcd": 1.76171875, "loss/idx": 10.0, "loss/logits": 0.35343722999095917, "step": 5686 }, { "epoch": 0.08491925428739949, "grad_norm": 0.423828125, "grad_norm_var": 0.022469186782836915, "learning_rate": 0.0001, "loss": 1.626, "loss/crossentropy": 2.7505728006362915, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.22758252173662186, "step": 5687 }, { "epoch": 0.08493418645801447, "grad_norm": 0.490234375, "grad_norm_var": 0.02200798988342285, "learning_rate": 0.0001, "loss": 1.6137, "loss/crossentropy": 2.4760303497314453, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.2035704329609871, "step": 5688 }, { "epoch": 0.08494911862862944, "grad_norm": 0.357421875, "grad_norm_var": 0.02232659657796224, "learning_rate": 0.0001, "loss": 1.5631, "loss/crossentropy": 2.619044303894043, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.2389179691672325, "step": 5689 }, { "epoch": 0.08496405079924443, "grad_norm": 0.349609375, "grad_norm_var": 0.02232659657796224, "learning_rate": 0.0001, "loss": 1.4017, "loss/crossentropy": 2.5342867374420166, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.17124644666910172, "step": 5690 }, { "epoch": 0.08497898296985941, "grad_norm": 0.3671875, "grad_norm_var": 0.02251275380452474, "learning_rate": 0.0001, "loss": 1.5156, "loss/crossentropy": 2.458285689353943, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.21479719132184982, "step": 5691 }, { "epoch": 0.0849939151404744, "grad_norm": 0.390625, "grad_norm_var": 0.02201105753580729, "learning_rate": 0.0001, "loss": 1.5682, "loss/crossentropy": 2.7162688970565796, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.20493563264608383, "step": 5692 }, { "epoch": 0.08500884731108937, "grad_norm": 0.447265625, "grad_norm_var": 0.021137491861979166, "learning_rate": 0.0001, "loss": 1.5372, "loss/crossentropy": 2.6200790405273438, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.19740477204322815, "step": 5693 }, { "epoch": 0.08502377948170436, "grad_norm": 0.392578125, "grad_norm_var": 0.021210734049479166, "learning_rate": 0.0001, "loss": 1.5627, "loss/crossentropy": 2.3402957916259766, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.22283299267292023, "step": 5694 }, { "epoch": 0.08503871165231934, "grad_norm": 0.3515625, "grad_norm_var": 0.02147253354390462, "learning_rate": 0.0001, "loss": 1.6165, "loss/crossentropy": 2.3807982206344604, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.2376427799463272, "step": 5695 }, { "epoch": 0.08505364382293432, "grad_norm": 0.33203125, "grad_norm_var": 0.021930932998657227, "learning_rate": 0.0001, "loss": 1.3848, "loss/crossentropy": 2.6678889989852905, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.1778111606836319, "step": 5696 }, { "epoch": 0.0850685759935493, "grad_norm": 0.31640625, "grad_norm_var": 0.02282992998758952, "learning_rate": 0.0001, "loss": 1.3323, "loss/crossentropy": 2.5372852087020874, "loss/fcd": 1.171875, "loss/idx": 10.0, "loss/logits": 0.1604379639029503, "step": 5697 }, { "epoch": 0.08508350816416428, "grad_norm": 0.36328125, "grad_norm_var": 0.023098103205362954, "learning_rate": 0.0001, "loss": 1.5015, "loss/crossentropy": 2.5359619855880737, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.18897782266139984, "step": 5698 }, { "epoch": 0.08509844033477927, "grad_norm": 0.359375, "grad_norm_var": 0.02335368792215983, "learning_rate": 0.0001, "loss": 1.5503, "loss/crossentropy": 2.540781855583191, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.21830499917268753, "step": 5699 }, { "epoch": 0.08511337250539425, "grad_norm": 0.453125, "grad_norm_var": 0.02340709368387858, "learning_rate": 0.0001, "loss": 1.6917, "loss/crossentropy": 2.813246011734009, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.23467838764190674, "step": 5700 }, { "epoch": 0.08512830467600922, "grad_norm": 0.48046875, "grad_norm_var": 0.023595682779947915, "learning_rate": 0.0001, "loss": 1.727, "loss/crossentropy": 2.6207462549209595, "loss/fcd": 1.48046875, "loss/idx": 10.0, "loss/logits": 0.24648620188236237, "step": 5701 }, { "epoch": 0.08514323684662421, "grad_norm": 0.330078125, "grad_norm_var": 0.003018681208292643, "learning_rate": 0.0001, "loss": 1.4778, "loss/crossentropy": 2.6263726949691772, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.2004212737083435, "step": 5702 }, { "epoch": 0.08515816901723919, "grad_norm": 0.357421875, "grad_norm_var": 0.0029754479726155597, "learning_rate": 0.0001, "loss": 1.4958, "loss/crossentropy": 2.717457413673401, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19889996200799942, "step": 5703 }, { "epoch": 0.08517310118785418, "grad_norm": 0.361328125, "grad_norm_var": 0.0021823724110921223, "learning_rate": 0.0001, "loss": 1.6064, "loss/crossentropy": 2.7899258136749268, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.23919229209423065, "step": 5704 }, { "epoch": 0.08518803335846915, "grad_norm": 0.3515625, "grad_norm_var": 0.0021987279256184896, "learning_rate": 0.0001, "loss": 1.4301, "loss/crossentropy": 2.5830384492874146, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.18400347232818604, "step": 5705 }, { "epoch": 0.08520296552908413, "grad_norm": 0.3515625, "grad_norm_var": 0.0021922906239827474, "learning_rate": 0.0001, "loss": 1.4047, "loss/crossentropy": 2.682445526123047, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17812543362379074, "step": 5706 }, { "epoch": 0.08521789769969912, "grad_norm": 0.345703125, "grad_norm_var": 0.00224456787109375, "learning_rate": 0.0001, "loss": 1.3418, "loss/crossentropy": 2.5230987071990967, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.1542539969086647, "step": 5707 }, { "epoch": 0.0852328298703141, "grad_norm": 0.37109375, "grad_norm_var": 0.002225176493326823, "learning_rate": 0.0001, "loss": 1.453, "loss/crossentropy": 2.6687514781951904, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.17954403907060623, "step": 5708 }, { "epoch": 0.08524776204092908, "grad_norm": 0.3671875, "grad_norm_var": 0.001830911636352539, "learning_rate": 0.0001, "loss": 1.6297, "loss/crossentropy": 2.5040180683135986, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.23124796152114868, "step": 5709 }, { "epoch": 0.08526269421154406, "grad_norm": 0.380859375, "grad_norm_var": 0.0018007755279541016, "learning_rate": 0.0001, "loss": 1.5363, "loss/crossentropy": 2.5883944034576416, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.212078757584095, "step": 5710 }, { "epoch": 0.08527762638215904, "grad_norm": 0.33984375, "grad_norm_var": 0.0018335819244384766, "learning_rate": 0.0001, "loss": 1.5458, "loss/crossentropy": 2.5921714305877686, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.2254517823457718, "step": 5711 }, { "epoch": 0.08529255855277403, "grad_norm": 0.310546875, "grad_norm_var": 0.0019606908162434896, "learning_rate": 0.0001, "loss": 1.3237, "loss/crossentropy": 2.56737744808197, "loss/fcd": 1.1640625, "loss/idx": 10.0, "loss/logits": 0.15960309654474258, "step": 5712 }, { "epoch": 0.085307490723389, "grad_norm": 0.318359375, "grad_norm_var": 0.0019482771555582683, "learning_rate": 0.0001, "loss": 1.3875, "loss/crossentropy": 2.605807662010193, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.1765531599521637, "step": 5713 }, { "epoch": 0.08532242289400399, "grad_norm": 0.3203125, "grad_norm_var": 0.002074162165323893, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.434282422065735, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.22425071895122528, "step": 5714 }, { "epoch": 0.08533735506461897, "grad_norm": 0.333984375, "grad_norm_var": 0.002124786376953125, "learning_rate": 0.0001, "loss": 1.4027, "loss/crossentropy": 2.4302480220794678, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.1800604686141014, "step": 5715 }, { "epoch": 0.08535228723523396, "grad_norm": 0.353515625, "grad_norm_var": 0.0015192508697509765, "learning_rate": 0.0001, "loss": 1.3469, "loss/crossentropy": 2.62381374835968, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.16329501569271088, "step": 5716 }, { "epoch": 0.08536721940584893, "grad_norm": 0.388671875, "grad_norm_var": 0.000505510965983073, "learning_rate": 0.0001, "loss": 1.5218, "loss/crossentropy": 2.621532678604126, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.19754647463560104, "step": 5717 }, { "epoch": 0.08538215157646391, "grad_norm": 0.337890625, "grad_norm_var": 0.0004897435506184896, "learning_rate": 0.0001, "loss": 1.4234, "loss/crossentropy": 2.743802309036255, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18123534321784973, "step": 5718 }, { "epoch": 0.0853970837470789, "grad_norm": 0.5078125, "grad_norm_var": 0.0020648797353108725, "learning_rate": 0.0001, "loss": 1.79, "loss/crossentropy": 2.6039748191833496, "loss/fcd": 1.47265625, "loss/idx": 10.0, "loss/logits": 0.31736449897289276, "step": 5719 }, { "epoch": 0.08541201591769387, "grad_norm": 0.416015625, "grad_norm_var": 0.00227049191792806, "learning_rate": 0.0001, "loss": 1.5425, "loss/crossentropy": 2.504498600959778, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.19487671554088593, "step": 5720 }, { "epoch": 0.08542694808830886, "grad_norm": 0.9296875, "grad_norm_var": 0.022341140111287437, "learning_rate": 0.0001, "loss": 1.8839, "loss/crossentropy": 2.479009985923767, "loss/fcd": 1.609375, "loss/idx": 10.0, "loss/logits": 0.27452707290649414, "step": 5721 }, { "epoch": 0.08544188025892384, "grad_norm": 0.46484375, "grad_norm_var": 0.022437016169230144, "learning_rate": 0.0001, "loss": 1.4021, "loss/crossentropy": 2.7543646097183228, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.1677730828523636, "step": 5722 }, { "epoch": 0.08545681242953881, "grad_norm": 0.384765625, "grad_norm_var": 0.022221485773722332, "learning_rate": 0.0001, "loss": 1.5879, "loss/crossentropy": 2.7148630619049072, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.22067754715681076, "step": 5723 }, { "epoch": 0.0854717446001538, "grad_norm": 0.359375, "grad_norm_var": 0.02228748003641764, "learning_rate": 0.0001, "loss": 1.5143, "loss/crossentropy": 2.567071795463562, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20176279544830322, "step": 5724 }, { "epoch": 0.08548667677076878, "grad_norm": 0.392578125, "grad_norm_var": 0.022192637125651043, "learning_rate": 0.0001, "loss": 1.5345, "loss/crossentropy": 2.63234281539917, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.20642179250717163, "step": 5725 }, { "epoch": 0.08550160894138377, "grad_norm": 0.388671875, "grad_norm_var": 0.022167460123697916, "learning_rate": 0.0001, "loss": 1.6017, "loss/crossentropy": 2.5836397409439087, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.22674478590488434, "step": 5726 }, { "epoch": 0.08551654111199875, "grad_norm": 0.31640625, "grad_norm_var": 0.022418467203776042, "learning_rate": 0.0001, "loss": 1.325, "loss/crossentropy": 2.4111900329589844, "loss/fcd": 1.16796875, "loss/idx": 10.0, "loss/logits": 0.15703780204057693, "step": 5727 }, { "epoch": 0.08553147328261372, "grad_norm": 0.384765625, "grad_norm_var": 0.021801185607910157, "learning_rate": 0.0001, "loss": 1.6178, "loss/crossentropy": 2.537398099899292, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.2193896472454071, "step": 5728 }, { "epoch": 0.08554640545322871, "grad_norm": 0.78125, "grad_norm_var": 0.029391717910766602, "learning_rate": 0.0001, "loss": 2.3179, "loss/crossentropy": 2.779664397239685, "loss/fcd": 1.84375, "loss/idx": 10.0, "loss/logits": 0.4741382449865341, "step": 5729 }, { "epoch": 0.08556133762384369, "grad_norm": 0.455078125, "grad_norm_var": 0.028353118896484376, "learning_rate": 0.0001, "loss": 1.8558, "loss/crossentropy": 2.5478415489196777, "loss/fcd": 1.58203125, "loss/idx": 10.0, "loss/logits": 0.27373111993074417, "step": 5730 }, { "epoch": 0.08557626979445868, "grad_norm": 0.55859375, "grad_norm_var": 0.028040552139282228, "learning_rate": 0.0001, "loss": 1.6032, "loss/crossentropy": 2.8402721881866455, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.22818928956985474, "step": 5731 }, { "epoch": 0.08559120196507365, "grad_norm": 0.330078125, "grad_norm_var": 0.028419351577758788, "learning_rate": 0.0001, "loss": 1.3963, "loss/crossentropy": 2.8428711891174316, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.17758063971996307, "step": 5732 }, { "epoch": 0.08560613413568864, "grad_norm": 0.3671875, "grad_norm_var": 0.0286590576171875, "learning_rate": 0.0001, "loss": 1.5058, "loss/crossentropy": 2.7082632780075073, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.20506670325994492, "step": 5733 }, { "epoch": 0.08562106630630362, "grad_norm": 0.384765625, "grad_norm_var": 0.02802734375, "learning_rate": 0.0001, "loss": 1.4947, "loss/crossentropy": 2.5956482887268066, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19780191779136658, "step": 5734 }, { "epoch": 0.08563599847691859, "grad_norm": 0.34375, "grad_norm_var": 0.028748321533203124, "learning_rate": 0.0001, "loss": 1.3786, "loss/crossentropy": 2.430763006210327, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.16768718510866165, "step": 5735 }, { "epoch": 0.08565093064753358, "grad_norm": 0.357421875, "grad_norm_var": 0.029256629943847656, "learning_rate": 0.0001, "loss": 1.5163, "loss/crossentropy": 2.6687629222869873, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.2037559077143669, "step": 5736 }, { "epoch": 0.08566586281814856, "grad_norm": 0.421875, "grad_norm_var": 0.012891578674316406, "learning_rate": 0.0001, "loss": 1.5533, "loss/crossentropy": 2.7926355600357056, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.19393512606620789, "step": 5737 }, { "epoch": 0.08568079498876355, "grad_norm": 0.431640625, "grad_norm_var": 0.012754042943318685, "learning_rate": 0.0001, "loss": 1.5511, "loss/crossentropy": 2.501401901245117, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.20737963914871216, "step": 5738 }, { "epoch": 0.08569572715937852, "grad_norm": 0.39453125, "grad_norm_var": 0.012719154357910156, "learning_rate": 0.0001, "loss": 1.5412, "loss/crossentropy": 2.418579578399658, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.1818416640162468, "step": 5739 }, { "epoch": 0.0857106593299935, "grad_norm": 0.404296875, "grad_norm_var": 0.01250163714090983, "learning_rate": 0.0001, "loss": 1.7514, "loss/crossentropy": 2.375158429145813, "loss/fcd": 1.48046875, "loss/idx": 10.0, "loss/logits": 0.2709405943751335, "step": 5740 }, { "epoch": 0.08572559150060849, "grad_norm": 0.357421875, "grad_norm_var": 0.012705341974894205, "learning_rate": 0.0001, "loss": 1.5154, "loss/crossentropy": 2.4552403688430786, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.1951100081205368, "step": 5741 }, { "epoch": 0.08574052367122346, "grad_norm": 0.392578125, "grad_norm_var": 0.012691354751586914, "learning_rate": 0.0001, "loss": 1.5437, "loss/crossentropy": 2.4535902738571167, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.2194589227437973, "step": 5742 }, { "epoch": 0.08575545584183845, "grad_norm": 0.365234375, "grad_norm_var": 0.012181536356608073, "learning_rate": 0.0001, "loss": 1.4905, "loss/crossentropy": 2.4615378379821777, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.18972381204366684, "step": 5743 }, { "epoch": 0.08577038801245343, "grad_norm": 0.376953125, "grad_norm_var": 0.012222735087076823, "learning_rate": 0.0001, "loss": 1.4609, "loss/crossentropy": 2.567346215248108, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.18750273436307907, "step": 5744 }, { "epoch": 0.0857853201830684, "grad_norm": 0.5234375, "grad_norm_var": 0.003964678446451823, "learning_rate": 0.0001, "loss": 1.5318, "loss/crossentropy": 2.4272565841674805, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.1919795423746109, "step": 5745 }, { "epoch": 0.0858002523536834, "grad_norm": 0.37109375, "grad_norm_var": 0.0038341363271077473, "learning_rate": 0.0001, "loss": 1.5071, "loss/crossentropy": 2.6248881816864014, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.19456350058317184, "step": 5746 }, { "epoch": 0.08581518452429837, "grad_norm": 0.328125, "grad_norm_var": 0.002243661880493164, "learning_rate": 0.0001, "loss": 1.5684, "loss/crossentropy": 2.522754669189453, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.228519469499588, "step": 5747 }, { "epoch": 0.08583011669491336, "grad_norm": 0.32421875, "grad_norm_var": 0.0022882461547851563, "learning_rate": 0.0001, "loss": 1.5251, "loss/crossentropy": 2.5795847177505493, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.21258530020713806, "step": 5748 }, { "epoch": 0.08584504886552834, "grad_norm": 0.640625, "grad_norm_var": 0.006347084045410156, "learning_rate": 0.0001, "loss": 1.6781, "loss/crossentropy": 3.3524285554885864, "loss/fcd": 1.484375, "loss/idx": 10.0, "loss/logits": 0.19369500130414963, "step": 5749 }, { "epoch": 0.08585998103614331, "grad_norm": 0.4296875, "grad_norm_var": 0.006375233332316081, "learning_rate": 0.0001, "loss": 1.5434, "loss/crossentropy": 2.676732301712036, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.19572490453720093, "step": 5750 }, { "epoch": 0.0858749132067583, "grad_norm": 0.322265625, "grad_norm_var": 0.00657647450764974, "learning_rate": 0.0001, "loss": 1.4739, "loss/crossentropy": 2.681555390357971, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19657614827156067, "step": 5751 }, { "epoch": 0.08588984537737328, "grad_norm": 0.392578125, "grad_norm_var": 0.006442006429036458, "learning_rate": 0.0001, "loss": 1.6802, "loss/crossentropy": 2.4873546361923218, "loss/fcd": 1.4609375, "loss/idx": 10.0, "loss/logits": 0.2192382663488388, "step": 5752 }, { "epoch": 0.08590477754798827, "grad_norm": 0.34375, "grad_norm_var": 0.0066454569498697914, "learning_rate": 0.0001, "loss": 1.5527, "loss/crossentropy": 2.53791344165802, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.22845451533794403, "step": 5753 }, { "epoch": 0.08591970971860324, "grad_norm": 0.38671875, "grad_norm_var": 0.0065814812978108725, "learning_rate": 0.0001, "loss": 1.5488, "loss/crossentropy": 2.39043927192688, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.22064237296581268, "step": 5754 }, { "epoch": 0.08593464188921823, "grad_norm": 0.357421875, "grad_norm_var": 0.00668023427327474, "learning_rate": 0.0001, "loss": 1.4094, "loss/crossentropy": 2.695526957511902, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.17894461005926132, "step": 5755 }, { "epoch": 0.08594957405983321, "grad_norm": 0.341796875, "grad_norm_var": 0.00684502919514974, "learning_rate": 0.0001, "loss": 1.4523, "loss/crossentropy": 2.625945210456848, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18668626993894577, "step": 5756 }, { "epoch": 0.08596450623044818, "grad_norm": 0.380859375, "grad_norm_var": 0.00677483876546224, "learning_rate": 0.0001, "loss": 1.5668, "loss/crossentropy": 2.7096407413482666, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.1996341571211815, "step": 5757 }, { "epoch": 0.08597943840106317, "grad_norm": 0.365234375, "grad_norm_var": 0.0068206787109375, "learning_rate": 0.0001, "loss": 1.5995, "loss/crossentropy": 2.5632375478744507, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.21273377537727356, "step": 5758 }, { "epoch": 0.08599437057167815, "grad_norm": 0.41015625, "grad_norm_var": 0.00679472287495931, "learning_rate": 0.0001, "loss": 1.5516, "loss/crossentropy": 2.889216423034668, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.21568378806114197, "step": 5759 }, { "epoch": 0.08600930274229314, "grad_norm": 0.33984375, "grad_norm_var": 0.006962331136067709, "learning_rate": 0.0001, "loss": 1.4472, "loss/crossentropy": 2.3322596549987793, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18155275285243988, "step": 5760 }, { "epoch": 0.08602423491290812, "grad_norm": 0.46484375, "grad_norm_var": 0.00614312489827474, "learning_rate": 0.0001, "loss": 1.545, "loss/crossentropy": 2.5115054845809937, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.20515680313110352, "step": 5761 }, { "epoch": 0.08603916708352309, "grad_norm": 0.380859375, "grad_norm_var": 0.006127786636352539, "learning_rate": 0.0001, "loss": 1.6895, "loss/crossentropy": 2.4125293493270874, "loss/fcd": 1.4453125, "loss/idx": 10.0, "loss/logits": 0.24419841170310974, "step": 5762 }, { "epoch": 0.08605409925413808, "grad_norm": 0.373046875, "grad_norm_var": 0.005894915262858073, "learning_rate": 0.0001, "loss": 1.5321, "loss/crossentropy": 2.4567710161209106, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.19611342251300812, "step": 5763 }, { "epoch": 0.08606903142475306, "grad_norm": 0.431640625, "grad_norm_var": 0.005661503473917643, "learning_rate": 0.0001, "loss": 1.6612, "loss/crossentropy": 2.6212682723999023, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.23935863375663757, "step": 5764 }, { "epoch": 0.08608396359536805, "grad_norm": 0.439453125, "grad_norm_var": 0.0016717910766601562, "learning_rate": 0.0001, "loss": 1.584, "loss/crossentropy": 2.508147120475769, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.19732074439525604, "step": 5765 }, { "epoch": 0.08609889576598302, "grad_norm": 0.361328125, "grad_norm_var": 0.0015566349029541016, "learning_rate": 0.0001, "loss": 1.4125, "loss/crossentropy": 2.5114747285842896, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1703389286994934, "step": 5766 }, { "epoch": 0.086113827936598, "grad_norm": 0.419921875, "grad_norm_var": 0.0013913313547770183, "learning_rate": 0.0001, "loss": 1.7861, "loss/crossentropy": 2.3057245016098022, "loss/fcd": 1.5234375, "loss/idx": 10.0, "loss/logits": 0.26267875730991364, "step": 5767 }, { "epoch": 0.08612876010721299, "grad_norm": 0.3671875, "grad_norm_var": 0.001412200927734375, "learning_rate": 0.0001, "loss": 1.4928, "loss/crossentropy": 2.616989016532898, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.1958777755498886, "step": 5768 }, { "epoch": 0.08614369227782796, "grad_norm": 0.349609375, "grad_norm_var": 0.0013819217681884765, "learning_rate": 0.0001, "loss": 1.5195, "loss/crossentropy": 2.6410118341445923, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20697301626205444, "step": 5769 }, { "epoch": 0.08615862444844295, "grad_norm": 0.412109375, "grad_norm_var": 0.001425933837890625, "learning_rate": 0.0001, "loss": 1.6833, "loss/crossentropy": 2.784039616584778, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.2458292469382286, "step": 5770 }, { "epoch": 0.08617355661905793, "grad_norm": 0.380859375, "grad_norm_var": 0.0013671875, "learning_rate": 0.0001, "loss": 1.5551, "loss/crossentropy": 2.6379687786102295, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.2074815183877945, "step": 5771 }, { "epoch": 0.0861884887896729, "grad_norm": 0.333984375, "grad_norm_var": 0.001419830322265625, "learning_rate": 0.0001, "loss": 1.4632, "loss/crossentropy": 2.5201319456100464, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.1936524584889412, "step": 5772 }, { "epoch": 0.0862034209602879, "grad_norm": 0.37890625, "grad_norm_var": 0.001421976089477539, "learning_rate": 0.0001, "loss": 1.389, "loss/crossentropy": 2.643931031227112, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.16245780885219574, "step": 5773 }, { "epoch": 0.08621835313090287, "grad_norm": 0.431640625, "grad_norm_var": 0.0014954725901285806, "learning_rate": 0.0001, "loss": 1.8195, "loss/crossentropy": 2.516612410545349, "loss/fcd": 1.52734375, "loss/idx": 10.0, "loss/logits": 0.29214295744895935, "step": 5774 }, { "epoch": 0.08623328530151786, "grad_norm": 0.357421875, "grad_norm_var": 0.0015431086222330729, "learning_rate": 0.0001, "loss": 1.4124, "loss/crossentropy": 2.5098389387130737, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.17803867906332016, "step": 5775 }, { "epoch": 0.08624821747213283, "grad_norm": 0.37890625, "grad_norm_var": 0.0013828913370768229, "learning_rate": 0.0001, "loss": 1.5515, "loss/crossentropy": 2.678349494934082, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.20772965252399445, "step": 5776 }, { "epoch": 0.08626314964274782, "grad_norm": 0.4296875, "grad_norm_var": 0.0011156717936197916, "learning_rate": 0.0001, "loss": 1.5208, "loss/crossentropy": 2.5203497409820557, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.19271857291460037, "step": 5777 }, { "epoch": 0.0862780818133628, "grad_norm": 0.40625, "grad_norm_var": 0.0011278629302978516, "learning_rate": 0.0001, "loss": 1.5445, "loss/crossentropy": 2.757035493850708, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.22423554956912994, "step": 5778 }, { "epoch": 0.08629301398397778, "grad_norm": 0.333984375, "grad_norm_var": 0.0013154188791910807, "learning_rate": 0.0001, "loss": 1.4913, "loss/crossentropy": 2.7590737342834473, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.21000518649816513, "step": 5779 }, { "epoch": 0.08630794615459277, "grad_norm": 0.435546875, "grad_norm_var": 0.001338942845662435, "learning_rate": 0.0001, "loss": 1.6045, "loss/crossentropy": 2.6048167943954468, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.2021472081542015, "step": 5780 }, { "epoch": 0.08632287832520774, "grad_norm": 0.4609375, "grad_norm_var": 0.0015136082967122397, "learning_rate": 0.0001, "loss": 1.5947, "loss/crossentropy": 2.717468023300171, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.2197076678276062, "step": 5781 }, { "epoch": 0.08633781049582273, "grad_norm": 0.443359375, "grad_norm_var": 0.0016217549641927083, "learning_rate": 0.0001, "loss": 1.5692, "loss/crossentropy": 2.7957069873809814, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.2098575457930565, "step": 5782 }, { "epoch": 0.0863527426664377, "grad_norm": 0.427734375, "grad_norm_var": 0.0016515096028645833, "learning_rate": 0.0001, "loss": 1.5085, "loss/crossentropy": 2.5493892431259155, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.19990349560976028, "step": 5783 }, { "epoch": 0.08636767483705268, "grad_norm": 0.349609375, "grad_norm_var": 0.0017371972401936848, "learning_rate": 0.0001, "loss": 1.5159, "loss/crossentropy": 2.4853904247283936, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.21117130666971207, "step": 5784 }, { "epoch": 0.08638260700766767, "grad_norm": 0.3984375, "grad_norm_var": 0.00159454345703125, "learning_rate": 0.0001, "loss": 1.577, "loss/crossentropy": 2.5425426959991455, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.21760665625333786, "step": 5785 }, { "epoch": 0.08639753917828265, "grad_norm": 0.4140625, "grad_norm_var": 0.0015985965728759766, "learning_rate": 0.0001, "loss": 1.4339, "loss/crossentropy": 2.5793375968933105, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.1682877093553543, "step": 5786 }, { "epoch": 0.08641247134889764, "grad_norm": 0.3828125, "grad_norm_var": 0.0015944798787434896, "learning_rate": 0.0001, "loss": 1.431, "loss/crossentropy": 2.6038827896118164, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.1810327246785164, "step": 5787 }, { "epoch": 0.08642740351951261, "grad_norm": 0.396484375, "grad_norm_var": 0.0013076146443684897, "learning_rate": 0.0001, "loss": 1.396, "loss/crossentropy": 2.6990219354629517, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.1693926602602005, "step": 5788 }, { "epoch": 0.08644233569012759, "grad_norm": 0.376953125, "grad_norm_var": 0.0013137658437093098, "learning_rate": 0.0001, "loss": 1.4658, "loss/crossentropy": 2.5982290506362915, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.18451900035142899, "step": 5789 }, { "epoch": 0.08645726786074258, "grad_norm": 0.353515625, "grad_norm_var": 0.0013811588287353516, "learning_rate": 0.0001, "loss": 1.6077, "loss/crossentropy": 2.70059335231781, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.2405213564634323, "step": 5790 }, { "epoch": 0.08647220003135755, "grad_norm": 0.361328125, "grad_norm_var": 0.001361703872680664, "learning_rate": 0.0001, "loss": 1.425, "loss/crossentropy": 2.5679848194122314, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.17893312871456146, "step": 5791 }, { "epoch": 0.08648713220197254, "grad_norm": 0.388671875, "grad_norm_var": 0.00134429931640625, "learning_rate": 0.0001, "loss": 1.5078, "loss/crossentropy": 2.7096747159957886, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.20314271748065948, "step": 5792 }, { "epoch": 0.08650206437258752, "grad_norm": 0.43359375, "grad_norm_var": 0.0013620376586914063, "learning_rate": 0.0001, "loss": 1.5454, "loss/crossentropy": 2.9023823738098145, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.2172812595963478, "step": 5793 }, { "epoch": 0.08651699654320251, "grad_norm": 0.365234375, "grad_norm_var": 0.001420450210571289, "learning_rate": 0.0001, "loss": 1.4073, "loss/crossentropy": 2.726659059524536, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.18856608122587204, "step": 5794 }, { "epoch": 0.08653192871381749, "grad_norm": 0.345703125, "grad_norm_var": 0.001333475112915039, "learning_rate": 0.0001, "loss": 1.4563, "loss/crossentropy": 2.656598210334778, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.18677915632724762, "step": 5795 }, { "epoch": 0.08654686088443246, "grad_norm": 0.34765625, "grad_norm_var": 0.0013513565063476562, "learning_rate": 0.0001, "loss": 1.4935, "loss/crossentropy": 2.856037139892578, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.20448674261569977, "step": 5796 }, { "epoch": 0.08656179305504745, "grad_norm": 0.349609375, "grad_norm_var": 0.001078653335571289, "learning_rate": 0.0001, "loss": 1.3675, "loss/crossentropy": 2.399014472961426, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.16831493377685547, "step": 5797 }, { "epoch": 0.08657672522566243, "grad_norm": 0.380859375, "grad_norm_var": 0.0008233229319254558, "learning_rate": 0.0001, "loss": 1.5015, "loss/crossentropy": 2.531559109687805, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.1890270635485649, "step": 5798 }, { "epoch": 0.08659165739627742, "grad_norm": 0.375, "grad_norm_var": 0.000658098856608073, "learning_rate": 0.0001, "loss": 1.5637, "loss/crossentropy": 2.6783162355422974, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.21218261122703552, "step": 5799 }, { "epoch": 0.08660658956689239, "grad_norm": 0.333984375, "grad_norm_var": 0.0007287979125976563, "learning_rate": 0.0001, "loss": 1.5691, "loss/crossentropy": 2.5383657217025757, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.22141698747873306, "step": 5800 }, { "epoch": 0.08662152173750737, "grad_norm": 0.359375, "grad_norm_var": 0.0007033665974934896, "learning_rate": 0.0001, "loss": 1.7277, "loss/crossentropy": 2.3632242679595947, "loss/fcd": 1.4609375, "loss/idx": 10.0, "loss/logits": 0.26675716042518616, "step": 5801 }, { "epoch": 0.08663645390812236, "grad_norm": 0.3359375, "grad_norm_var": 0.0006550470987955729, "learning_rate": 0.0001, "loss": 1.3465, "loss/crossentropy": 2.4070327281951904, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.15898023545742035, "step": 5802 }, { "epoch": 0.08665138607873733, "grad_norm": 0.333984375, "grad_norm_var": 0.0007071018218994141, "learning_rate": 0.0001, "loss": 1.5131, "loss/crossentropy": 2.537081718444824, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.2084408923983574, "step": 5803 }, { "epoch": 0.08666631824935232, "grad_norm": 0.306640625, "grad_norm_var": 0.0008328596750895183, "learning_rate": 0.0001, "loss": 1.4112, "loss/crossentropy": 2.7086355686187744, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.17287904024124146, "step": 5804 }, { "epoch": 0.0866812504199673, "grad_norm": 0.41015625, "grad_norm_var": 0.0009801228841145833, "learning_rate": 0.0001, "loss": 1.5767, "loss/crossentropy": 2.4436029195785522, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.197799913585186, "step": 5805 }, { "epoch": 0.08669618259058227, "grad_norm": 0.318359375, "grad_norm_var": 0.0010939915974934897, "learning_rate": 0.0001, "loss": 1.4494, "loss/crossentropy": 2.538907289505005, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.19549265503883362, "step": 5806 }, { "epoch": 0.08671111476119726, "grad_norm": 0.37890625, "grad_norm_var": 0.0011184533437093099, "learning_rate": 0.0001, "loss": 1.6145, "loss/crossentropy": 2.344360828399658, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.23950839042663574, "step": 5807 }, { "epoch": 0.08672604693181224, "grad_norm": 0.375, "grad_norm_var": 0.0010782877604166667, "learning_rate": 0.0001, "loss": 1.6079, "loss/crossentropy": 2.7166266441345215, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.22898957878351212, "step": 5808 }, { "epoch": 0.08674097910242723, "grad_norm": 0.431640625, "grad_norm_var": 0.0010591983795166016, "learning_rate": 0.0001, "loss": 1.8116, "loss/crossentropy": 2.4299365282058716, "loss/fcd": 1.53125, "loss/idx": 10.0, "loss/logits": 0.280388206243515, "step": 5809 }, { "epoch": 0.0867559112730422, "grad_norm": 0.337890625, "grad_norm_var": 0.0010841210683186849, "learning_rate": 0.0001, "loss": 1.3592, "loss/crossentropy": 2.576874017715454, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.16000083088874817, "step": 5810 }, { "epoch": 0.08677084344365718, "grad_norm": 0.427734375, "grad_norm_var": 0.0013751824696858724, "learning_rate": 0.0001, "loss": 1.3957, "loss/crossentropy": 2.5010448694229126, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.16524122655391693, "step": 5811 }, { "epoch": 0.08678577561427217, "grad_norm": 0.3671875, "grad_norm_var": 0.0013599236806233724, "learning_rate": 0.0001, "loss": 1.4734, "loss/crossentropy": 2.58427631855011, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1921120584011078, "step": 5812 }, { "epoch": 0.08680070778488715, "grad_norm": 0.369140625, "grad_norm_var": 0.001346572240193685, "learning_rate": 0.0001, "loss": 1.5809, "loss/crossentropy": 2.693935990333557, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.20984956622123718, "step": 5813 }, { "epoch": 0.08681563995550214, "grad_norm": 0.42578125, "grad_norm_var": 0.0015670140584309897, "learning_rate": 0.0001, "loss": 1.6026, "loss/crossentropy": 2.4898252487182617, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.21194706857204437, "step": 5814 }, { "epoch": 0.08683057212611711, "grad_norm": 0.33203125, "grad_norm_var": 0.001641845703125, "learning_rate": 0.0001, "loss": 1.466, "loss/crossentropy": 2.4574761390686035, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1847347617149353, "step": 5815 }, { "epoch": 0.0868455042967321, "grad_norm": 0.384765625, "grad_norm_var": 0.0015914281209309896, "learning_rate": 0.0001, "loss": 1.5215, "loss/crossentropy": 2.6295565366744995, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.21679292619228363, "step": 5816 }, { "epoch": 0.08686043646734708, "grad_norm": 0.34375, "grad_norm_var": 0.001625506083170573, "learning_rate": 0.0001, "loss": 1.4298, "loss/crossentropy": 2.696202278137207, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.17981181293725967, "step": 5817 }, { "epoch": 0.08687536863796205, "grad_norm": 0.388671875, "grad_norm_var": 0.0015778700510660807, "learning_rate": 0.0001, "loss": 1.5228, "loss/crossentropy": 2.640082597732544, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.21031814813613892, "step": 5818 }, { "epoch": 0.08689030080857704, "grad_norm": 0.333984375, "grad_norm_var": 0.0015778700510660807, "learning_rate": 0.0001, "loss": 1.4635, "loss/crossentropy": 2.70998215675354, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.18611733615398407, "step": 5819 }, { "epoch": 0.08690523297919202, "grad_norm": 0.34375, "grad_norm_var": 0.0013468424479166667, "learning_rate": 0.0001, "loss": 1.4532, "loss/crossentropy": 2.6498098373413086, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.1836862713098526, "step": 5820 }, { "epoch": 0.08692016514980701, "grad_norm": 0.39453125, "grad_norm_var": 0.0012847900390625, "learning_rate": 0.0001, "loss": 1.4955, "loss/crossentropy": 2.642079710960388, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19085106253623962, "step": 5821 }, { "epoch": 0.08693509732042198, "grad_norm": 0.474609375, "grad_norm_var": 0.0016916910807291666, "learning_rate": 0.0001, "loss": 1.5026, "loss/crossentropy": 2.749747395515442, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.1939944252371788, "step": 5822 }, { "epoch": 0.08695002949103696, "grad_norm": 0.39453125, "grad_norm_var": 0.0017008463541666667, "learning_rate": 0.0001, "loss": 1.6939, "loss/crossentropy": 2.5759551525115967, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.2525184452533722, "step": 5823 }, { "epoch": 0.08696496166165195, "grad_norm": 0.392578125, "grad_norm_var": 0.0017018477121988932, "learning_rate": 0.0001, "loss": 1.4204, "loss/crossentropy": 2.44952392578125, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18214137852191925, "step": 5824 }, { "epoch": 0.08697989383226692, "grad_norm": 0.384765625, "grad_norm_var": 0.0015408674875895183, "learning_rate": 0.0001, "loss": 1.3829, "loss/crossentropy": 2.4516501426696777, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17198575288057327, "step": 5825 }, { "epoch": 0.08699482600288191, "grad_norm": 0.359375, "grad_norm_var": 0.0014462788899739583, "learning_rate": 0.0001, "loss": 1.3392, "loss/crossentropy": 2.5842173099517822, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.16339941322803497, "step": 5826 }, { "epoch": 0.08700975817349689, "grad_norm": 0.39453125, "grad_norm_var": 0.0013141473134358724, "learning_rate": 0.0001, "loss": 1.5306, "loss/crossentropy": 2.665157914161682, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.21025575697422028, "step": 5827 }, { "epoch": 0.08702469034411187, "grad_norm": 0.396484375, "grad_norm_var": 0.0013167699178059896, "learning_rate": 0.0001, "loss": 1.6635, "loss/crossentropy": 2.8008904457092285, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.20650754868984222, "step": 5828 }, { "epoch": 0.08703962251472686, "grad_norm": 0.37890625, "grad_norm_var": 0.0013058821360270182, "learning_rate": 0.0001, "loss": 1.3643, "loss/crossentropy": 2.662898898124695, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.15333091467618942, "step": 5829 }, { "epoch": 0.08705455468534183, "grad_norm": 0.380859375, "grad_norm_var": 0.0011739095052083334, "learning_rate": 0.0001, "loss": 1.4718, "loss/crossentropy": 2.707575559616089, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19832369685173035, "step": 5830 }, { "epoch": 0.08706948685595682, "grad_norm": 0.361328125, "grad_norm_var": 0.00104063351949056, "learning_rate": 0.0001, "loss": 1.5739, "loss/crossentropy": 2.451979160308838, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.22619488090276718, "step": 5831 }, { "epoch": 0.0870844190265718, "grad_norm": 0.4296875, "grad_norm_var": 0.0011850357055664062, "learning_rate": 0.0001, "loss": 1.6899, "loss/crossentropy": 3.0032734870910645, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.24845877289772034, "step": 5832 }, { "epoch": 0.08709935119718677, "grad_norm": 0.41796875, "grad_norm_var": 0.0011258443196614583, "learning_rate": 0.0001, "loss": 1.4787, "loss/crossentropy": 2.4336864948272705, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.1818002685904503, "step": 5833 }, { "epoch": 0.08711428336780176, "grad_norm": 0.37109375, "grad_norm_var": 0.0011463006337483724, "learning_rate": 0.0001, "loss": 1.6814, "loss/crossentropy": 2.355494737625122, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.24000639468431473, "step": 5834 }, { "epoch": 0.08712921553841674, "grad_norm": 0.37890625, "grad_norm_var": 0.0009485244750976563, "learning_rate": 0.0001, "loss": 1.371, "loss/crossentropy": 2.5775580406188965, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.16007475554943085, "step": 5835 }, { "epoch": 0.08714414770903173, "grad_norm": 0.375, "grad_norm_var": 0.0008132298787434896, "learning_rate": 0.0001, "loss": 1.7568, "loss/crossentropy": 2.648366928100586, "loss/fcd": 1.484375, "loss/idx": 10.0, "loss/logits": 0.2723776549100876, "step": 5836 }, { "epoch": 0.0871590798796467, "grad_norm": 0.38671875, "grad_norm_var": 0.0008152643839518229, "learning_rate": 0.0001, "loss": 1.5364, "loss/crossentropy": 2.868632435798645, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.21607856452465057, "step": 5837 }, { "epoch": 0.08717401205026169, "grad_norm": 0.390625, "grad_norm_var": 0.0003347873687744141, "learning_rate": 0.0001, "loss": 1.3792, "loss/crossentropy": 3.0160642862319946, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.16437625885009766, "step": 5838 }, { "epoch": 0.08718894422087667, "grad_norm": 0.361328125, "grad_norm_var": 0.0003707249959309896, "learning_rate": 0.0001, "loss": 1.4262, "loss/crossentropy": 2.6654666662216187, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18402274698019028, "step": 5839 }, { "epoch": 0.08720387639149164, "grad_norm": 0.388671875, "grad_norm_var": 0.00036773681640625, "learning_rate": 0.0001, "loss": 1.4726, "loss/crossentropy": 2.7233105897903442, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.1952771618962288, "step": 5840 }, { "epoch": 0.08721880856210663, "grad_norm": 0.34765625, "grad_norm_var": 0.0004538059234619141, "learning_rate": 0.0001, "loss": 1.5334, "loss/crossentropy": 2.433345317840576, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.2091536447405815, "step": 5841 }, { "epoch": 0.08723374073272161, "grad_norm": 0.37890625, "grad_norm_var": 0.0004175662994384766, "learning_rate": 0.0001, "loss": 1.5655, "loss/crossentropy": 2.746241331100464, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.22567030787467957, "step": 5842 }, { "epoch": 0.0872486729033366, "grad_norm": 0.359375, "grad_norm_var": 0.00044388771057128905, "learning_rate": 0.0001, "loss": 1.4863, "loss/crossentropy": 2.7615379095077515, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.20504751801490784, "step": 5843 }, { "epoch": 0.08726360507395158, "grad_norm": 0.35546875, "grad_norm_var": 0.0004669189453125, "learning_rate": 0.0001, "loss": 1.5757, "loss/crossentropy": 2.6458569765090942, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.22807341068983078, "step": 5844 }, { "epoch": 0.08727853724456655, "grad_norm": 0.349609375, "grad_norm_var": 0.0005205631256103516, "learning_rate": 0.0001, "loss": 1.5171, "loss/crossentropy": 2.5146602392196655, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20463445037603378, "step": 5845 }, { "epoch": 0.08729346941518154, "grad_norm": 0.380859375, "grad_norm_var": 0.0005205631256103516, "learning_rate": 0.0001, "loss": 1.5417, "loss/crossentropy": 2.5723944902420044, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.20963767170906067, "step": 5846 }, { "epoch": 0.08730840158579652, "grad_norm": 0.314453125, "grad_norm_var": 0.0007563114166259765, "learning_rate": 0.0001, "loss": 1.4353, "loss/crossentropy": 2.558489680290222, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1892033815383911, "step": 5847 }, { "epoch": 0.0873233337564115, "grad_norm": 0.3828125, "grad_norm_var": 0.0005465030670166015, "learning_rate": 0.0001, "loss": 1.5951, "loss/crossentropy": 2.791352152824402, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.22794315963983536, "step": 5848 }, { "epoch": 0.08733826592702648, "grad_norm": 0.33984375, "grad_norm_var": 0.0004409631093343099, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.7380852699279785, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18826083838939667, "step": 5849 }, { "epoch": 0.08735319809764146, "grad_norm": 0.392578125, "grad_norm_var": 0.00048344930013020834, "learning_rate": 0.0001, "loss": 1.5298, "loss/crossentropy": 2.5688047409057617, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.18608660250902176, "step": 5850 }, { "epoch": 0.08736813026825645, "grad_norm": 0.39453125, "grad_norm_var": 0.0005221048990885417, "learning_rate": 0.0001, "loss": 1.5971, "loss/crossentropy": 2.6007808446884155, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.2259777933359146, "step": 5851 }, { "epoch": 0.08738306243887142, "grad_norm": 0.380859375, "grad_norm_var": 0.0005292097727457682, "learning_rate": 0.0001, "loss": 1.5659, "loss/crossentropy": 2.6690810918807983, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.21040938794612885, "step": 5852 }, { "epoch": 0.08739799460948641, "grad_norm": 0.318359375, "grad_norm_var": 0.000659942626953125, "learning_rate": 0.0001, "loss": 1.3482, "loss/crossentropy": 2.553123712539673, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.16853126138448715, "step": 5853 }, { "epoch": 0.08741292678010139, "grad_norm": 0.3203125, "grad_norm_var": 0.000726318359375, "learning_rate": 0.0001, "loss": 1.449, "loss/crossentropy": 2.6135905981063843, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.19510553777217865, "step": 5854 }, { "epoch": 0.08742785895071638, "grad_norm": 0.349609375, "grad_norm_var": 0.0007333755493164062, "learning_rate": 0.0001, "loss": 1.4357, "loss/crossentropy": 2.6665090322494507, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.19352786242961884, "step": 5855 }, { "epoch": 0.08744279112133135, "grad_norm": 0.408203125, "grad_norm_var": 0.0008328755696614584, "learning_rate": 0.0001, "loss": 1.7515, "loss/crossentropy": 2.5150562524795532, "loss/fcd": 1.4765625, "loss/idx": 10.0, "loss/logits": 0.274942547082901, "step": 5856 }, { "epoch": 0.08745772329194633, "grad_norm": 0.388671875, "grad_norm_var": 0.0008659203847249349, "learning_rate": 0.0001, "loss": 1.4346, "loss/crossentropy": 2.5296586751937866, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18460613489151, "step": 5857 }, { "epoch": 0.08747265546256132, "grad_norm": 0.400390625, "grad_norm_var": 0.000939178466796875, "learning_rate": 0.0001, "loss": 1.5358, "loss/crossentropy": 2.7336610555648804, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.2193921133875847, "step": 5858 }, { "epoch": 0.0874875876331763, "grad_norm": 0.431640625, "grad_norm_var": 0.0012138207753499349, "learning_rate": 0.0001, "loss": 1.574, "loss/crossentropy": 2.7309199571609497, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.21075022965669632, "step": 5859 }, { "epoch": 0.08750251980379128, "grad_norm": 0.41015625, "grad_norm_var": 0.0013001600901285807, "learning_rate": 0.0001, "loss": 1.461, "loss/crossentropy": 2.457706332206726, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.16414885222911835, "step": 5860 }, { "epoch": 0.08751745197440626, "grad_norm": 0.35546875, "grad_norm_var": 0.0012842814127604167, "learning_rate": 0.0001, "loss": 1.3972, "loss/crossentropy": 2.778298497200012, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17457310110330582, "step": 5861 }, { "epoch": 0.08753238414502124, "grad_norm": 0.361328125, "grad_norm_var": 0.0012877782185872395, "learning_rate": 0.0001, "loss": 1.3465, "loss/crossentropy": 2.575591564178467, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.16292570531368256, "step": 5862 }, { "epoch": 0.08754731631563623, "grad_norm": 0.42578125, "grad_norm_var": 0.0012107690175374348, "learning_rate": 0.0001, "loss": 1.3533, "loss/crossentropy": 2.5458080768585205, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.15408024936914444, "step": 5863 }, { "epoch": 0.0875622484862512, "grad_norm": 0.3515625, "grad_norm_var": 0.001255019505818685, "learning_rate": 0.0001, "loss": 1.4885, "loss/crossentropy": 2.5595922470092773, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19947980344295502, "step": 5864 }, { "epoch": 0.08757718065686619, "grad_norm": 0.431640625, "grad_norm_var": 0.0013289769490559896, "learning_rate": 0.0001, "loss": 1.598, "loss/crossentropy": 2.8690444231033325, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.20733477175235748, "step": 5865 }, { "epoch": 0.08759211282748117, "grad_norm": 0.361328125, "grad_norm_var": 0.0013483047485351562, "learning_rate": 0.0001, "loss": 1.5136, "loss/crossentropy": 2.6273584365844727, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20112062245607376, "step": 5866 }, { "epoch": 0.08760704499809614, "grad_norm": 0.365234375, "grad_norm_var": 0.0013475894927978515, "learning_rate": 0.0001, "loss": 1.4032, "loss/crossentropy": 2.402259945869446, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1727115660905838, "step": 5867 }, { "epoch": 0.08762197716871113, "grad_norm": 0.435546875, "grad_norm_var": 0.0015496412913004558, "learning_rate": 0.0001, "loss": 1.5819, "loss/crossentropy": 2.648182988166809, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.20694654434919357, "step": 5868 }, { "epoch": 0.08763690933932611, "grad_norm": 0.349609375, "grad_norm_var": 0.0013446648915608724, "learning_rate": 0.0001, "loss": 1.4631, "loss/crossentropy": 2.718559503555298, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.19353727251291275, "step": 5869 }, { "epoch": 0.0876518415099411, "grad_norm": 0.337890625, "grad_norm_var": 0.0012143452962239583, "learning_rate": 0.0001, "loss": 1.5278, "loss/crossentropy": 2.498771548271179, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.22311966121196747, "step": 5870 }, { "epoch": 0.08766677368055607, "grad_norm": 0.353515625, "grad_norm_var": 0.0011967341105143228, "learning_rate": 0.0001, "loss": 1.4554, "loss/crossentropy": 2.627473831176758, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19763225317001343, "step": 5871 }, { "epoch": 0.08768170585117105, "grad_norm": 0.365234375, "grad_norm_var": 0.0011820475260416667, "learning_rate": 0.0001, "loss": 1.3436, "loss/crossentropy": 2.7405651807785034, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.16786612570285797, "step": 5872 }, { "epoch": 0.08769663802178604, "grad_norm": 0.380859375, "grad_norm_var": 0.0011797587076822916, "learning_rate": 0.0001, "loss": 1.4633, "loss/crossentropy": 2.433356523513794, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.18594570457935333, "step": 5873 }, { "epoch": 0.08771157019240101, "grad_norm": 0.392578125, "grad_norm_var": 0.0011647542317708333, "learning_rate": 0.0001, "loss": 1.4887, "loss/crossentropy": 2.8374948501586914, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.1995881274342537, "step": 5874 }, { "epoch": 0.087726502363016, "grad_norm": 0.3515625, "grad_norm_var": 0.001033767064412435, "learning_rate": 0.0001, "loss": 1.485, "loss/crossentropy": 2.749606132507324, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19597094506025314, "step": 5875 }, { "epoch": 0.08774143453363098, "grad_norm": 0.390625, "grad_norm_var": 0.0009708245595296224, "learning_rate": 0.0001, "loss": 1.4481, "loss/crossentropy": 2.67609441280365, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19024036824703217, "step": 5876 }, { "epoch": 0.08775636670424597, "grad_norm": 0.42578125, "grad_norm_var": 0.00109098752339681, "learning_rate": 0.0001, "loss": 1.5964, "loss/crossentropy": 2.6460975408554077, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.2135498747229576, "step": 5877 }, { "epoch": 0.08777129887486095, "grad_norm": 0.3828125, "grad_norm_var": 0.0010663350423177084, "learning_rate": 0.0001, "loss": 1.4705, "loss/crossentropy": 2.621293067932129, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.1853673979640007, "step": 5878 }, { "epoch": 0.08778623104547592, "grad_norm": 0.392578125, "grad_norm_var": 0.0009385267893473307, "learning_rate": 0.0001, "loss": 1.7208, "loss/crossentropy": 2.223662495613098, "loss/fcd": 1.46875, "loss/idx": 10.0, "loss/logits": 0.25203440338373184, "step": 5879 }, { "epoch": 0.08780116321609091, "grad_norm": 0.3359375, "grad_norm_var": 0.0010115146636962891, "learning_rate": 0.0001, "loss": 1.4665, "loss/crossentropy": 2.6820632219314575, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.20088926702737808, "step": 5880 }, { "epoch": 0.08781609538670589, "grad_norm": 0.3984375, "grad_norm_var": 0.0008442560831705729, "learning_rate": 0.0001, "loss": 1.4975, "loss/crossentropy": 2.5433363914489746, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19284669309854507, "step": 5881 }, { "epoch": 0.08783102755732088, "grad_norm": 0.376953125, "grad_norm_var": 0.0008284886678059896, "learning_rate": 0.0001, "loss": 1.5552, "loss/crossentropy": 2.410026788711548, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.19976390898227692, "step": 5882 }, { "epoch": 0.08784595972793585, "grad_norm": 0.36328125, "grad_norm_var": 0.0008318424224853516, "learning_rate": 0.0001, "loss": 1.493, "loss/crossentropy": 2.46897554397583, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19610964506864548, "step": 5883 }, { "epoch": 0.08786089189855083, "grad_norm": 0.326171875, "grad_norm_var": 0.0007268110911051432, "learning_rate": 0.0001, "loss": 1.4365, "loss/crossentropy": 2.7138389348983765, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.1865176111459732, "step": 5884 }, { "epoch": 0.08787582406916582, "grad_norm": 0.326171875, "grad_norm_var": 0.0008256117502848308, "learning_rate": 0.0001, "loss": 1.4745, "loss/crossentropy": 2.5515546798706055, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19720414280891418, "step": 5885 }, { "epoch": 0.08789075623978079, "grad_norm": 0.404296875, "grad_norm_var": 0.0008277734120686849, "learning_rate": 0.0001, "loss": 1.5022, "loss/crossentropy": 2.334363102912903, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.19363503903150558, "step": 5886 }, { "epoch": 0.08790568841039578, "grad_norm": 0.34375, "grad_norm_var": 0.0008590062459309896, "learning_rate": 0.0001, "loss": 1.4102, "loss/crossentropy": 2.47863233089447, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1797613576054573, "step": 5887 }, { "epoch": 0.08792062058101076, "grad_norm": 0.3671875, "grad_norm_var": 0.000857400894165039, "learning_rate": 0.0001, "loss": 1.3469, "loss/crossentropy": 2.476583957672119, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.15935209393501282, "step": 5888 }, { "epoch": 0.08793555275162573, "grad_norm": 0.375, "grad_norm_var": 0.00085296630859375, "learning_rate": 0.0001, "loss": 1.5429, "loss/crossentropy": 2.7936620712280273, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.21477127075195312, "step": 5889 }, { "epoch": 0.08795048492224072, "grad_norm": 0.39453125, "grad_norm_var": 0.0008585453033447266, "learning_rate": 0.0001, "loss": 1.4628, "loss/crossentropy": 2.527057647705078, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.18150702118873596, "step": 5890 }, { "epoch": 0.0879654170928557, "grad_norm": 0.34375, "grad_norm_var": 0.0008838494618733724, "learning_rate": 0.0001, "loss": 1.4967, "loss/crossentropy": 2.456738233566284, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19198741763830185, "step": 5891 }, { "epoch": 0.08798034926347069, "grad_norm": 0.3984375, "grad_norm_var": 0.0009073734283447266, "learning_rate": 0.0001, "loss": 1.3877, "loss/crossentropy": 2.4731478691101074, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.16507112234830856, "step": 5892 }, { "epoch": 0.08799528143408566, "grad_norm": 0.3671875, "grad_norm_var": 0.0007032871246337891, "learning_rate": 0.0001, "loss": 1.3349, "loss/crossentropy": 2.4948081970214844, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.15522754937410355, "step": 5893 }, { "epoch": 0.08801021360470064, "grad_norm": 0.3671875, "grad_norm_var": 0.0006887912750244141, "learning_rate": 0.0001, "loss": 1.4941, "loss/crossentropy": 2.5968817472457886, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19721058011054993, "step": 5894 }, { "epoch": 0.08802514577531563, "grad_norm": 0.384765625, "grad_norm_var": 0.0006665388743082682, "learning_rate": 0.0001, "loss": 1.572, "loss/crossentropy": 2.3805805444717407, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.1931355744600296, "step": 5895 }, { "epoch": 0.0880400779459306, "grad_norm": 0.400390625, "grad_norm_var": 0.0006586710611979166, "learning_rate": 0.0001, "loss": 1.5388, "loss/crossentropy": 2.7892513275146484, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.2302192598581314, "step": 5896 }, { "epoch": 0.0880550101165456, "grad_norm": 0.373046875, "grad_norm_var": 0.0006063938140869141, "learning_rate": 0.0001, "loss": 1.5577, "loss/crossentropy": 2.481887102127075, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.20614773780107498, "step": 5897 }, { "epoch": 0.08806994228716057, "grad_norm": 0.37890625, "grad_norm_var": 0.0006085713704427083, "learning_rate": 0.0001, "loss": 1.4326, "loss/crossentropy": 2.7628233432769775, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1748274490237236, "step": 5898 }, { "epoch": 0.08808487445777556, "grad_norm": 0.466796875, "grad_norm_var": 0.0011906782786051432, "learning_rate": 0.0001, "loss": 1.6433, "loss/crossentropy": 2.5081233978271484, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.20579956471920013, "step": 5899 }, { "epoch": 0.08809980662839054, "grad_norm": 0.359375, "grad_norm_var": 0.0010385513305664062, "learning_rate": 0.0001, "loss": 1.5006, "loss/crossentropy": 2.5079076290130615, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.1998567059636116, "step": 5900 }, { "epoch": 0.08811473879900551, "grad_norm": 0.466796875, "grad_norm_var": 0.0012994766235351562, "learning_rate": 0.0001, "loss": 1.5185, "loss/crossentropy": 2.515405535697937, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.17087069153785706, "step": 5901 }, { "epoch": 0.0881296709696205, "grad_norm": 0.357421875, "grad_norm_var": 0.0013284683227539062, "learning_rate": 0.0001, "loss": 1.4139, "loss/crossentropy": 2.6452709436416626, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.17952585220336914, "step": 5902 }, { "epoch": 0.08814460314023548, "grad_norm": 0.369140625, "grad_norm_var": 0.0012323856353759766, "learning_rate": 0.0001, "loss": 1.5531, "loss/crossentropy": 2.6035348176956177, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.21715590357780457, "step": 5903 }, { "epoch": 0.08815953531085047, "grad_norm": 0.369140625, "grad_norm_var": 0.0012278238932291667, "learning_rate": 0.0001, "loss": 1.5758, "loss/crossentropy": 2.2424561977386475, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.2125551402568817, "step": 5904 }, { "epoch": 0.08817446748146544, "grad_norm": 0.36328125, "grad_norm_var": 0.0012531916300455729, "learning_rate": 0.0001, "loss": 1.5113, "loss/crossentropy": 2.602207660675049, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.20274799317121506, "step": 5905 }, { "epoch": 0.08818939965208042, "grad_norm": 0.36328125, "grad_norm_var": 0.001274553934733073, "learning_rate": 0.0001, "loss": 1.5452, "loss/crossentropy": 2.5647398233413696, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20923157036304474, "step": 5906 }, { "epoch": 0.08820433182269541, "grad_norm": 0.33203125, "grad_norm_var": 0.0013445536295572916, "learning_rate": 0.0001, "loss": 1.4745, "loss/crossentropy": 2.539007306098938, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.2010260671377182, "step": 5907 }, { "epoch": 0.08821926399331038, "grad_norm": 0.37109375, "grad_norm_var": 0.001332537333170573, "learning_rate": 0.0001, "loss": 1.6507, "loss/crossentropy": 2.700190544128418, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.22489262372255325, "step": 5908 }, { "epoch": 0.08823419616392537, "grad_norm": 0.71875, "grad_norm_var": 0.008427874247233073, "learning_rate": 0.0001, "loss": 1.5613, "loss/crossentropy": 3.5226815938949585, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.15502911806106567, "step": 5909 }, { "epoch": 0.08824912833454035, "grad_norm": 0.373046875, "grad_norm_var": 0.008402363459269205, "learning_rate": 0.0001, "loss": 1.585, "loss/crossentropy": 2.492139458656311, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.21781882643699646, "step": 5910 }, { "epoch": 0.08826406050515533, "grad_norm": 0.390625, "grad_norm_var": 0.008390299479166667, "learning_rate": 0.0001, "loss": 1.6684, "loss/crossentropy": 2.7377853393554688, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.24262036383152008, "step": 5911 }, { "epoch": 0.08827899267577032, "grad_norm": 0.37890625, "grad_norm_var": 0.00842754046122233, "learning_rate": 0.0001, "loss": 1.5752, "loss/crossentropy": 2.5941301584243774, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.21191222965717316, "step": 5912 }, { "epoch": 0.08829392484638529, "grad_norm": 0.35546875, "grad_norm_var": 0.008514658610026041, "learning_rate": 0.0001, "loss": 1.453, "loss/crossentropy": 2.6959677934646606, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.1873829960823059, "step": 5913 }, { "epoch": 0.08830885701700028, "grad_norm": 0.376953125, "grad_norm_var": 0.00852061907450358, "learning_rate": 0.0001, "loss": 1.6507, "loss/crossentropy": 2.6868001222610474, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.22887183725833893, "step": 5914 }, { "epoch": 0.08832378918761526, "grad_norm": 0.46875, "grad_norm_var": 0.008538055419921874, "learning_rate": 0.0001, "loss": 1.7142, "loss/crossentropy": 2.8155874013900757, "loss/fcd": 1.4609375, "loss/idx": 10.0, "loss/logits": 0.25328561663627625, "step": 5915 }, { "epoch": 0.08833872135823025, "grad_norm": 0.32421875, "grad_norm_var": 0.008809852600097656, "learning_rate": 0.0001, "loss": 1.4357, "loss/crossentropy": 2.6020933389663696, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.18965394794940948, "step": 5916 }, { "epoch": 0.08835365352884522, "grad_norm": 0.4140625, "grad_norm_var": 0.008504724502563477, "learning_rate": 0.0001, "loss": 1.5526, "loss/crossentropy": 2.611222267150879, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.2128012403845787, "step": 5917 }, { "epoch": 0.0883685856994602, "grad_norm": 0.37109375, "grad_norm_var": 0.008447202046712239, "learning_rate": 0.0001, "loss": 1.481, "loss/crossentropy": 2.702362895011902, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1997726410627365, "step": 5918 }, { "epoch": 0.08838351787007519, "grad_norm": 0.357421875, "grad_norm_var": 0.008498128255208333, "learning_rate": 0.0001, "loss": 1.6062, "loss/crossentropy": 2.2824639081954956, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.22733919322490692, "step": 5919 }, { "epoch": 0.08839845004069016, "grad_norm": 0.357421875, "grad_norm_var": 0.00854791005452474, "learning_rate": 0.0001, "loss": 1.3969, "loss/crossentropy": 2.65229594707489, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17425088584423065, "step": 5920 }, { "epoch": 0.08841338221130515, "grad_norm": 0.32421875, "grad_norm_var": 0.00880730946858724, "learning_rate": 0.0001, "loss": 1.5001, "loss/crossentropy": 2.629738211631775, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.2110155001282692, "step": 5921 }, { "epoch": 0.08842831438192013, "grad_norm": 0.37109375, "grad_norm_var": 0.008780860900878906, "learning_rate": 0.0001, "loss": 1.4441, "loss/crossentropy": 2.820119261741638, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.19023501127958298, "step": 5922 }, { "epoch": 0.0884432465525351, "grad_norm": 0.3671875, "grad_norm_var": 0.008573150634765625, "learning_rate": 0.0001, "loss": 1.5131, "loss/crossentropy": 2.640541911125183, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20055898278951645, "step": 5923 }, { "epoch": 0.0884581787231501, "grad_norm": 0.314453125, "grad_norm_var": 0.008954350153605144, "learning_rate": 0.0001, "loss": 1.4853, "loss/crossentropy": 2.6469569206237793, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.1962595134973526, "step": 5924 }, { "epoch": 0.08847311089376507, "grad_norm": 0.369140625, "grad_norm_var": 0.0013379414876302084, "learning_rate": 0.0001, "loss": 1.4048, "loss/crossentropy": 2.6118494272232056, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.1782153695821762, "step": 5925 }, { "epoch": 0.08848804306438006, "grad_norm": 0.435546875, "grad_norm_var": 0.001610565185546875, "learning_rate": 0.0001, "loss": 1.7113, "loss/crossentropy": 2.4782938957214355, "loss/fcd": 1.47265625, "loss/idx": 10.0, "loss/logits": 0.23861250281333923, "step": 5926 }, { "epoch": 0.08850297523499503, "grad_norm": 0.96875, "grad_norm_var": 0.02381718953450521, "learning_rate": 0.0001, "loss": 2.2251, "loss/crossentropy": 2.4670222997665405, "loss/fcd": 1.828125, "loss/idx": 10.0, "loss/logits": 0.39701876044273376, "step": 5927 }, { "epoch": 0.08851790740561001, "grad_norm": 0.46875, "grad_norm_var": 0.02395318349202474, "learning_rate": 0.0001, "loss": 1.5256, "loss/crossentropy": 2.392128586769104, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.19358104467391968, "step": 5928 }, { "epoch": 0.088532839576225, "grad_norm": 0.4140625, "grad_norm_var": 0.023700459798177084, "learning_rate": 0.0001, "loss": 1.466, "loss/crossentropy": 2.656103730201721, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.1925540342926979, "step": 5929 }, { "epoch": 0.08854777174683998, "grad_norm": 0.404296875, "grad_norm_var": 0.023594093322753907, "learning_rate": 0.0001, "loss": 1.5254, "loss/crossentropy": 2.464789390563965, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.19722777605056763, "step": 5930 }, { "epoch": 0.08856270391745497, "grad_norm": 0.357421875, "grad_norm_var": 0.023654794692993163, "learning_rate": 0.0001, "loss": 1.4534, "loss/crossentropy": 2.901877760887146, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.19951699674129486, "step": 5931 }, { "epoch": 0.08857763608806994, "grad_norm": 0.361328125, "grad_norm_var": 0.023298136393229165, "learning_rate": 0.0001, "loss": 1.4871, "loss/crossentropy": 2.564202666282654, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.20583372563123703, "step": 5932 }, { "epoch": 0.08859256825868492, "grad_norm": 0.3984375, "grad_norm_var": 0.023317464192708335, "learning_rate": 0.0001, "loss": 1.6822, "loss/crossentropy": 2.67582905292511, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.2681182101368904, "step": 5933 }, { "epoch": 0.0886075004292999, "grad_norm": 0.369140625, "grad_norm_var": 0.02332914670308431, "learning_rate": 0.0001, "loss": 1.5001, "loss/crossentropy": 2.9840288162231445, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.210999995470047, "step": 5934 }, { "epoch": 0.08862243259991488, "grad_norm": 0.375, "grad_norm_var": 0.023213704427083332, "learning_rate": 0.0001, "loss": 1.4054, "loss/crossentropy": 2.6596217155456543, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.19447427988052368, "step": 5935 }, { "epoch": 0.08863736477052987, "grad_norm": 0.421875, "grad_norm_var": 0.022969802220662434, "learning_rate": 0.0001, "loss": 1.7348, "loss/crossentropy": 2.874703884124756, "loss/fcd": 1.47265625, "loss/idx": 10.0, "loss/logits": 0.2621202617883682, "step": 5936 }, { "epoch": 0.08865229694114485, "grad_norm": 0.400390625, "grad_norm_var": 0.022359212239583332, "learning_rate": 0.0001, "loss": 1.504, "loss/crossentropy": 2.62591814994812, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.19537615776062012, "step": 5937 }, { "epoch": 0.08866722911175984, "grad_norm": 0.376953125, "grad_norm_var": 0.02231939633687337, "learning_rate": 0.0001, "loss": 1.4175, "loss/crossentropy": 2.8659260272979736, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.1792599782347679, "step": 5938 }, { "epoch": 0.08868216128237481, "grad_norm": 0.357421875, "grad_norm_var": 0.022400856018066406, "learning_rate": 0.0001, "loss": 1.5183, "loss/crossentropy": 2.5412901639938354, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.20971951633691788, "step": 5939 }, { "epoch": 0.08869709345298979, "grad_norm": 0.392578125, "grad_norm_var": 0.02163537343343099, "learning_rate": 0.0001, "loss": 1.5126, "loss/crossentropy": 2.6109578609466553, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.1922624632716179, "step": 5940 }, { "epoch": 0.08871202562360478, "grad_norm": 0.34375, "grad_norm_var": 0.021879816055297853, "learning_rate": 0.0001, "loss": 1.4694, "loss/crossentropy": 2.474845290184021, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.19987713545560837, "step": 5941 }, { "epoch": 0.08872695779421975, "grad_norm": 0.40234375, "grad_norm_var": 0.0219146728515625, "learning_rate": 0.0001, "loss": 1.532, "loss/crossentropy": 2.7145360708236694, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.2078186348080635, "step": 5942 }, { "epoch": 0.08874188996483474, "grad_norm": 0.34375, "grad_norm_var": 0.0010813395182291666, "learning_rate": 0.0001, "loss": 1.3894, "loss/crossentropy": 2.7009546756744385, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.17451035976409912, "step": 5943 }, { "epoch": 0.08875682213544972, "grad_norm": 0.400390625, "grad_norm_var": 0.0006257216135660807, "learning_rate": 0.0001, "loss": 1.6835, "loss/crossentropy": 2.458385467529297, "loss/fcd": 1.47265625, "loss/idx": 10.0, "loss/logits": 0.21082435548305511, "step": 5944 }, { "epoch": 0.0887717543060647, "grad_norm": 0.384765625, "grad_norm_var": 0.0005558649698893229, "learning_rate": 0.0001, "loss": 1.6805, "loss/crossentropy": 2.5001702308654785, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.2469010055065155, "step": 5945 }, { "epoch": 0.08878668647667969, "grad_norm": 0.384765625, "grad_norm_var": 0.000518035888671875, "learning_rate": 0.0001, "loss": 1.4935, "loss/crossentropy": 2.65566086769104, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.22004888206720352, "step": 5946 }, { "epoch": 0.08880161864729466, "grad_norm": 0.3984375, "grad_norm_var": 0.0005030155181884766, "learning_rate": 0.0001, "loss": 1.3707, "loss/crossentropy": 2.6573861837387085, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.17145368456840515, "step": 5947 }, { "epoch": 0.08881655081790965, "grad_norm": 0.3984375, "grad_norm_var": 0.0004870096842447917, "learning_rate": 0.0001, "loss": 1.5678, "loss/crossentropy": 2.5794482231140137, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.20846372842788696, "step": 5948 }, { "epoch": 0.08883148298852463, "grad_norm": 0.431640625, "grad_norm_var": 0.000618600845336914, "learning_rate": 0.0001, "loss": 1.5319, "loss/crossentropy": 2.831605911254883, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.21549133956432343, "step": 5949 }, { "epoch": 0.0888464151591396, "grad_norm": 0.328125, "grad_norm_var": 0.00081787109375, "learning_rate": 0.0001, "loss": 1.4724, "loss/crossentropy": 2.468858242034912, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1911798119544983, "step": 5950 }, { "epoch": 0.08886134732975459, "grad_norm": 0.376953125, "grad_norm_var": 0.0008158206939697266, "learning_rate": 0.0001, "loss": 1.6349, "loss/crossentropy": 2.2885020971298218, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.2208677977323532, "step": 5951 }, { "epoch": 0.08887627950036957, "grad_norm": 0.349609375, "grad_norm_var": 0.0007764180501302083, "learning_rate": 0.0001, "loss": 1.496, "loss/crossentropy": 2.7424720525741577, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.19523131102323532, "step": 5952 }, { "epoch": 0.08889121167098456, "grad_norm": 0.353515625, "grad_norm_var": 0.0007825215657552083, "learning_rate": 0.0001, "loss": 1.4373, "loss/crossentropy": 2.7814356088638306, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.17952273786067963, "step": 5953 }, { "epoch": 0.08890614384159953, "grad_norm": 0.34765625, "grad_norm_var": 0.0008342583974202474, "learning_rate": 0.0001, "loss": 1.4007, "loss/crossentropy": 2.4324878454208374, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17415361106395721, "step": 5954 }, { "epoch": 0.08892107601221451, "grad_norm": 0.392578125, "grad_norm_var": 0.0008308251698811848, "learning_rate": 0.0001, "loss": 1.6739, "loss/crossentropy": 2.392095446586609, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.2324596866965294, "step": 5955 }, { "epoch": 0.0889360081828295, "grad_norm": 0.40234375, "grad_norm_var": 0.0008572896321614583, "learning_rate": 0.0001, "loss": 1.6289, "loss/crossentropy": 2.612600088119507, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.23435799777507782, "step": 5956 }, { "epoch": 0.08895094035344447, "grad_norm": 0.396484375, "grad_norm_var": 0.0007942040761311849, "learning_rate": 0.0001, "loss": 1.6723, "loss/crossentropy": 2.299489736557007, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.23478594422340393, "step": 5957 }, { "epoch": 0.08896587252405946, "grad_norm": 0.33984375, "grad_norm_var": 0.0008582909901936849, "learning_rate": 0.0001, "loss": 1.4829, "loss/crossentropy": 2.6180907487869263, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.20165319740772247, "step": 5958 }, { "epoch": 0.08898080469467444, "grad_norm": 0.5234375, "grad_norm_var": 0.0020836989084879556, "learning_rate": 0.0001, "loss": 1.7229, "loss/crossentropy": 2.4460089206695557, "loss/fcd": 1.484375, "loss/idx": 10.0, "loss/logits": 0.23850122094154358, "step": 5959 }, { "epoch": 0.08899573686528943, "grad_norm": 0.34375, "grad_norm_var": 0.0021910985310872396, "learning_rate": 0.0001, "loss": 1.349, "loss/crossentropy": 2.5074318647384644, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.16539529711008072, "step": 5960 }, { "epoch": 0.0890106690359044, "grad_norm": 0.41015625, "grad_norm_var": 0.002232217788696289, "learning_rate": 0.0001, "loss": 1.5526, "loss/crossentropy": 2.4816508293151855, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.22057944536209106, "step": 5961 }, { "epoch": 0.08902560120651938, "grad_norm": 0.365234375, "grad_norm_var": 0.002259556452433268, "learning_rate": 0.0001, "loss": 1.4939, "loss/crossentropy": 2.7245408296585083, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.20089855790138245, "step": 5962 }, { "epoch": 0.08904053337713437, "grad_norm": 0.369140625, "grad_norm_var": 0.002260271708170573, "learning_rate": 0.0001, "loss": 1.4328, "loss/crossentropy": 2.6973825693130493, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.19453661888837814, "step": 5963 }, { "epoch": 0.08905546554774935, "grad_norm": 0.466796875, "grad_norm_var": 0.0026925245920817057, "learning_rate": 0.0001, "loss": 1.4267, "loss/crossentropy": 2.727285623550415, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1805814728140831, "step": 5964 }, { "epoch": 0.08907039771836434, "grad_norm": 0.37890625, "grad_norm_var": 0.002554766337076823, "learning_rate": 0.0001, "loss": 1.4397, "loss/crossentropy": 2.7349058389663696, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.189689502120018, "step": 5965 }, { "epoch": 0.08908532988897931, "grad_norm": 0.380859375, "grad_norm_var": 0.0023354689280192056, "learning_rate": 0.0001, "loss": 1.4564, "loss/crossentropy": 2.559323310852051, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1751411259174347, "step": 5966 }, { "epoch": 0.08910026205959429, "grad_norm": 0.328125, "grad_norm_var": 0.002552032470703125, "learning_rate": 0.0001, "loss": 1.4357, "loss/crossentropy": 2.727842092514038, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1896510124206543, "step": 5967 }, { "epoch": 0.08911519423020928, "grad_norm": 0.33984375, "grad_norm_var": 0.00260313351949056, "learning_rate": 0.0001, "loss": 1.51, "loss/crossentropy": 2.704896569252014, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.21699320524930954, "step": 5968 }, { "epoch": 0.08913012640082425, "grad_norm": 0.361328125, "grad_norm_var": 0.002575540542602539, "learning_rate": 0.0001, "loss": 1.3851, "loss/crossentropy": 2.6355080604553223, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17415382713079453, "step": 5969 }, { "epoch": 0.08914505857143924, "grad_norm": 0.34375, "grad_norm_var": 0.00259550412495931, "learning_rate": 0.0001, "loss": 1.3768, "loss/crossentropy": 2.877719759941101, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.16975004971027374, "step": 5970 }, { "epoch": 0.08915999074205422, "grad_norm": 0.447265625, "grad_norm_var": 0.002845621109008789, "learning_rate": 0.0001, "loss": 1.5349, "loss/crossentropy": 2.5208033323287964, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.2106911614537239, "step": 5971 }, { "epoch": 0.0891749229126692, "grad_norm": 0.380859375, "grad_norm_var": 0.0028314590454101562, "learning_rate": 0.0001, "loss": 1.4689, "loss/crossentropy": 2.7316819429397583, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.19939502328634262, "step": 5972 }, { "epoch": 0.08918985508328418, "grad_norm": 0.4140625, "grad_norm_var": 0.0028753757476806642, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.8061563968658447, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1889425292611122, "step": 5973 }, { "epoch": 0.08920478725389916, "grad_norm": 0.330078125, "grad_norm_var": 0.0029428482055664064, "learning_rate": 0.0001, "loss": 1.5187, "loss/crossentropy": 2.5318233966827393, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.21399053931236267, "step": 5974 }, { "epoch": 0.08921971942451415, "grad_norm": 0.33984375, "grad_norm_var": 0.00169677734375, "learning_rate": 0.0001, "loss": 1.3301, "loss/crossentropy": 2.5954277515411377, "loss/fcd": 1.16796875, "loss/idx": 10.0, "loss/logits": 0.1621674746274948, "step": 5975 }, { "epoch": 0.08923465159512912, "grad_norm": 0.37109375, "grad_norm_var": 0.0016295750935872396, "learning_rate": 0.0001, "loss": 1.3738, "loss/crossentropy": 2.609924077987671, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.1745682656764984, "step": 5976 }, { "epoch": 0.0892495837657441, "grad_norm": 0.41015625, "grad_norm_var": 0.0016295750935872396, "learning_rate": 0.0001, "loss": 1.6478, "loss/crossentropy": 2.7321152687072754, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.23761563003063202, "step": 5977 }, { "epoch": 0.08926451593635909, "grad_norm": 0.376953125, "grad_norm_var": 0.0016202290852864584, "learning_rate": 0.0001, "loss": 1.4943, "loss/crossentropy": 2.6589653491973877, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.20129582285881042, "step": 5978 }, { "epoch": 0.08927944810697407, "grad_norm": 0.353515625, "grad_norm_var": 0.0016527811686197916, "learning_rate": 0.0001, "loss": 1.5832, "loss/crossentropy": 2.414724349975586, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.21987849473953247, "step": 5979 }, { "epoch": 0.08929438027758906, "grad_norm": 0.84765625, "grad_norm_var": 0.015305821100870769, "learning_rate": 0.0001, "loss": 1.7583, "loss/crossentropy": 2.410378098487854, "loss/fcd": 1.46484375, "loss/idx": 10.0, "loss/logits": 0.29348859935998917, "step": 5980 }, { "epoch": 0.08930931244820403, "grad_norm": 0.3984375, "grad_norm_var": 0.01527403195699056, "learning_rate": 0.0001, "loss": 1.6709, "loss/crossentropy": 2.5201176404953003, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.22949735820293427, "step": 5981 }, { "epoch": 0.08932424461881902, "grad_norm": 0.392578125, "grad_norm_var": 0.015250380833943684, "learning_rate": 0.0001, "loss": 1.5394, "loss/crossentropy": 2.6100491285324097, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.2073644995689392, "step": 5982 }, { "epoch": 0.089339176789434, "grad_norm": 0.42578125, "grad_norm_var": 0.014881626764933268, "learning_rate": 0.0001, "loss": 1.7287, "loss/crossentropy": 2.4833961725234985, "loss/fcd": 1.48828125, "loss/idx": 10.0, "loss/logits": 0.24037756025791168, "step": 5983 }, { "epoch": 0.08935410896004897, "grad_norm": 0.328125, "grad_norm_var": 0.014997212092081706, "learning_rate": 0.0001, "loss": 1.423, "loss/crossentropy": 2.552529811859131, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18861643970012665, "step": 5984 }, { "epoch": 0.08936904113066396, "grad_norm": 0.390625, "grad_norm_var": 0.014870134989420573, "learning_rate": 0.0001, "loss": 1.42, "loss/crossentropy": 2.619381904602051, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18171476572752, "step": 5985 }, { "epoch": 0.08938397330127894, "grad_norm": 0.5546875, "grad_norm_var": 0.015803972880045574, "learning_rate": 0.0001, "loss": 1.8996, "loss/crossentropy": 2.699922204017639, "loss/fcd": 1.578125, "loss/idx": 10.0, "loss/logits": 0.3214544579386711, "step": 5986 }, { "epoch": 0.08939890547189393, "grad_norm": 0.37109375, "grad_norm_var": 0.015916172663370767, "learning_rate": 0.0001, "loss": 1.5712, "loss/crossentropy": 2.5707424879074097, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.2118256539106369, "step": 5987 }, { "epoch": 0.0894138376425089, "grad_norm": 0.41015625, "grad_norm_var": 0.015825335184733072, "learning_rate": 0.0001, "loss": 1.5123, "loss/crossentropy": 2.7733123302459717, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.20373833179473877, "step": 5988 }, { "epoch": 0.08942876981312388, "grad_norm": 0.322265625, "grad_norm_var": 0.016420729955037437, "learning_rate": 0.0001, "loss": 1.4625, "loss/crossentropy": 2.549202084541321, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19683901965618134, "step": 5989 }, { "epoch": 0.08944370198373887, "grad_norm": 0.35546875, "grad_norm_var": 0.01617711385091146, "learning_rate": 0.0001, "loss": 1.4993, "loss/crossentropy": 2.621713876724243, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.1945803165435791, "step": 5990 }, { "epoch": 0.08945863415435384, "grad_norm": 0.431640625, "grad_norm_var": 0.015777444839477538, "learning_rate": 0.0001, "loss": 1.6847, "loss/crossentropy": 2.497760772705078, "loss/fcd": 1.4609375, "loss/idx": 10.0, "loss/logits": 0.22372134029865265, "step": 5991 }, { "epoch": 0.08947356632496883, "grad_norm": 0.4765625, "grad_norm_var": 0.01576714515686035, "learning_rate": 0.0001, "loss": 1.4057, "loss/crossentropy": 2.7669135332107544, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17917509377002716, "step": 5992 }, { "epoch": 0.08948849849558381, "grad_norm": 0.34765625, "grad_norm_var": 0.01615878740946452, "learning_rate": 0.0001, "loss": 1.6407, "loss/crossentropy": 2.7148231267929077, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.25784264504909515, "step": 5993 }, { "epoch": 0.08950343066619879, "grad_norm": 0.3515625, "grad_norm_var": 0.016358184814453124, "learning_rate": 0.0001, "loss": 1.4552, "loss/crossentropy": 2.623978018760681, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18959136307239532, "step": 5994 }, { "epoch": 0.08951836283681378, "grad_norm": 0.4609375, "grad_norm_var": 0.01609330177307129, "learning_rate": 0.0001, "loss": 1.6877, "loss/crossentropy": 2.4268769025802612, "loss/fcd": 1.4453125, "loss/idx": 10.0, "loss/logits": 0.24238358438014984, "step": 5995 }, { "epoch": 0.08953329500742875, "grad_norm": 0.390625, "grad_norm_var": 0.0036409854888916015, "learning_rate": 0.0001, "loss": 1.775, "loss/crossentropy": 2.6209572553634644, "loss/fcd": 1.51953125, "loss/idx": 10.0, "loss/logits": 0.25544989854097366, "step": 5996 }, { "epoch": 0.08954822717804374, "grad_norm": 0.35546875, "grad_norm_var": 0.0037682692209879557, "learning_rate": 0.0001, "loss": 1.5006, "loss/crossentropy": 2.63802969455719, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.19983041286468506, "step": 5997 }, { "epoch": 0.08956315934865872, "grad_norm": 0.333984375, "grad_norm_var": 0.004023853937784831, "learning_rate": 0.0001, "loss": 1.4269, "loss/crossentropy": 2.4182231426239014, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1808399111032486, "step": 5998 }, { "epoch": 0.0895780915192737, "grad_norm": 0.41015625, "grad_norm_var": 0.003973245620727539, "learning_rate": 0.0001, "loss": 1.6681, "loss/crossentropy": 2.5062289237976074, "loss/fcd": 1.4296875, "loss/idx": 10.0, "loss/logits": 0.23837802559137344, "step": 5999 }, { "epoch": 0.08959302368988868, "grad_norm": 0.30859375, "grad_norm_var": 0.004166523615519206, "learning_rate": 0.0001, "loss": 1.5075, "loss/crossentropy": 2.37389600276947, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.21449406445026398, "step": 6000 }, { "epoch": 0.08960795586050366, "grad_norm": 0.330078125, "grad_norm_var": 0.004406483968098959, "learning_rate": 0.0001, "loss": 1.3256, "loss/crossentropy": 2.456379771232605, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.1498226821422577, "step": 6001 }, { "epoch": 0.08962288803111865, "grad_norm": 0.384765625, "grad_norm_var": 0.0024387200673421225, "learning_rate": 0.0001, "loss": 1.6963, "loss/crossentropy": 2.579658269882202, "loss/fcd": 1.4453125, "loss/idx": 10.0, "loss/logits": 0.25098244845867157, "step": 6002 }, { "epoch": 0.08963782020173362, "grad_norm": 0.328125, "grad_norm_var": 0.0025911808013916017, "learning_rate": 0.0001, "loss": 1.403, "loss/crossentropy": 2.4448989629745483, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.18036343157291412, "step": 6003 }, { "epoch": 0.08965275237234861, "grad_norm": 0.3515625, "grad_norm_var": 0.0025301456451416017, "learning_rate": 0.0001, "loss": 1.5223, "loss/crossentropy": 2.583523750305176, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20977721363306046, "step": 6004 }, { "epoch": 0.08966768454296359, "grad_norm": 0.40625, "grad_norm_var": 0.0024228413899739583, "learning_rate": 0.0001, "loss": 1.4264, "loss/crossentropy": 2.7701836824417114, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1803232729434967, "step": 6005 }, { "epoch": 0.08968261671357856, "grad_norm": 0.3671875, "grad_norm_var": 0.0023986180623372395, "learning_rate": 0.0001, "loss": 1.4332, "loss/crossentropy": 2.56534481048584, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.19104423373937607, "step": 6006 }, { "epoch": 0.08969754888419355, "grad_norm": 0.53515625, "grad_norm_var": 0.003819767634073893, "learning_rate": 0.0001, "loss": 1.5345, "loss/crossentropy": 2.581760048866272, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.20638290792703629, "step": 6007 }, { "epoch": 0.08971248105480853, "grad_norm": 0.349609375, "grad_norm_var": 0.0032546361287434894, "learning_rate": 0.0001, "loss": 1.3987, "loss/crossentropy": 2.6578365564346313, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17215529829263687, "step": 6008 }, { "epoch": 0.08972741322542352, "grad_norm": 0.333984375, "grad_norm_var": 0.0033174991607666016, "learning_rate": 0.0001, "loss": 1.406, "loss/crossentropy": 2.651191830635071, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.1912032663822174, "step": 6009 }, { "epoch": 0.0897423453960385, "grad_norm": 0.37109375, "grad_norm_var": 0.0032806237538655597, "learning_rate": 0.0001, "loss": 1.4853, "loss/crossentropy": 2.6979880332946777, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.20012254267930984, "step": 6010 }, { "epoch": 0.08975727756665347, "grad_norm": 0.3828125, "grad_norm_var": 0.002778355280558268, "learning_rate": 0.0001, "loss": 1.5781, "loss/crossentropy": 2.723617196083069, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.21479582786560059, "step": 6011 }, { "epoch": 0.08977220973726846, "grad_norm": 0.3515625, "grad_norm_var": 0.0027726332346598307, "learning_rate": 0.0001, "loss": 1.3773, "loss/crossentropy": 2.6511118412017822, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.1819654256105423, "step": 6012 }, { "epoch": 0.08978714190788344, "grad_norm": 0.41796875, "grad_norm_var": 0.0029058933258056642, "learning_rate": 0.0001, "loss": 1.7155, "loss/crossentropy": 2.304448366165161, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.2896904796361923, "step": 6013 }, { "epoch": 0.08980207407849843, "grad_norm": 0.376953125, "grad_norm_var": 0.0027995904286702475, "learning_rate": 0.0001, "loss": 1.4829, "loss/crossentropy": 2.5488473176956177, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.19778752326965332, "step": 6014 }, { "epoch": 0.0898170062491134, "grad_norm": 0.431640625, "grad_norm_var": 0.0029280980428059897, "learning_rate": 0.0001, "loss": 1.625, "loss/crossentropy": 2.584078788757324, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.21873314678668976, "step": 6015 }, { "epoch": 0.08983193841972838, "grad_norm": 0.373046875, "grad_norm_var": 0.002602370580037435, "learning_rate": 0.0001, "loss": 1.4257, "loss/crossentropy": 2.752264618873596, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18354015052318573, "step": 6016 }, { "epoch": 0.08984687059034337, "grad_norm": 0.322265625, "grad_norm_var": 0.002658955256144206, "learning_rate": 0.0001, "loss": 1.3112, "loss/crossentropy": 2.58688223361969, "loss/fcd": 1.15625, "loss/idx": 10.0, "loss/logits": 0.15493462979793549, "step": 6017 }, { "epoch": 0.08986180276095834, "grad_norm": 0.369140625, "grad_norm_var": 0.002664804458618164, "learning_rate": 0.0001, "loss": 1.5293, "loss/crossentropy": 2.748197913169861, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.20509565621614456, "step": 6018 }, { "epoch": 0.08987673493157333, "grad_norm": 0.37890625, "grad_norm_var": 0.002479664484659831, "learning_rate": 0.0001, "loss": 1.3664, "loss/crossentropy": 2.6860464811325073, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.17503127455711365, "step": 6019 }, { "epoch": 0.08989166710218831, "grad_norm": 0.3359375, "grad_norm_var": 0.0025592645009358725, "learning_rate": 0.0001, "loss": 1.5272, "loss/crossentropy": 2.4979690313339233, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.19908242672681808, "step": 6020 }, { "epoch": 0.0899065992728033, "grad_norm": 0.412109375, "grad_norm_var": 0.0025807698567708332, "learning_rate": 0.0001, "loss": 1.5161, "loss/crossentropy": 2.6769349575042725, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.20749130100011826, "step": 6021 }, { "epoch": 0.08992153144341827, "grad_norm": 0.35546875, "grad_norm_var": 0.0026122411092122395, "learning_rate": 0.0001, "loss": 1.3985, "loss/crossentropy": 2.450356602668762, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.1719469577074051, "step": 6022 }, { "epoch": 0.08993646361403325, "grad_norm": 0.474609375, "grad_norm_var": 0.0015977064768473307, "learning_rate": 0.0001, "loss": 1.6624, "loss/crossentropy": 2.7023407220840454, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.24440427869558334, "step": 6023 }, { "epoch": 0.08995139578464824, "grad_norm": 0.39453125, "grad_norm_var": 0.0015578587849934896, "learning_rate": 0.0001, "loss": 1.7026, "loss/crossentropy": 2.344549059867859, "loss/fcd": 1.453125, "loss/idx": 10.0, "loss/logits": 0.24944236129522324, "step": 6024 }, { "epoch": 0.08996632795526321, "grad_norm": 0.359375, "grad_norm_var": 0.00144193967183431, "learning_rate": 0.0001, "loss": 1.3216, "loss/crossentropy": 2.773011326789856, "loss/fcd": 1.16796875, "loss/idx": 10.0, "loss/logits": 0.1536191776394844, "step": 6025 }, { "epoch": 0.0899812601258782, "grad_norm": 0.39453125, "grad_norm_var": 0.0014430840810139975, "learning_rate": 0.0001, "loss": 1.5715, "loss/crossentropy": 2.4849932193756104, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.20043876022100449, "step": 6026 }, { "epoch": 0.08999619229649318, "grad_norm": 0.337890625, "grad_norm_var": 0.0015714009602864583, "learning_rate": 0.0001, "loss": 1.5278, "loss/crossentropy": 2.6739816665649414, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.2075263112783432, "step": 6027 }, { "epoch": 0.09001112446710816, "grad_norm": 0.37109375, "grad_norm_var": 0.0015202204386393229, "learning_rate": 0.0001, "loss": 1.4385, "loss/crossentropy": 2.644996762275696, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18845635652542114, "step": 6028 }, { "epoch": 0.09002605663772315, "grad_norm": 0.416015625, "grad_norm_var": 0.0015109856923421225, "learning_rate": 0.0001, "loss": 1.5145, "loss/crossentropy": 2.92212450504303, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.19806398451328278, "step": 6029 }, { "epoch": 0.09004098880833812, "grad_norm": 0.337890625, "grad_norm_var": 0.0016298770904541015, "learning_rate": 0.0001, "loss": 1.4391, "loss/crossentropy": 2.556459903717041, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.1852339655160904, "step": 6030 }, { "epoch": 0.09005592097895311, "grad_norm": 0.423828125, "grad_norm_var": 0.0015788873036702473, "learning_rate": 0.0001, "loss": 1.5208, "loss/crossentropy": 2.6322938203811646, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.19654358178377151, "step": 6031 }, { "epoch": 0.09007085314956809, "grad_norm": 0.376953125, "grad_norm_var": 0.0015769799550374348, "learning_rate": 0.0001, "loss": 1.5024, "loss/crossentropy": 2.5856072902679443, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.20552971214056015, "step": 6032 }, { "epoch": 0.09008578532018306, "grad_norm": 0.376953125, "grad_norm_var": 0.001351785659790039, "learning_rate": 0.0001, "loss": 1.4706, "loss/crossentropy": 2.74534273147583, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.2089121788740158, "step": 6033 }, { "epoch": 0.09010071749079805, "grad_norm": 0.416015625, "grad_norm_var": 0.001407480239868164, "learning_rate": 0.0001, "loss": 1.5908, "loss/crossentropy": 2.6793121099472046, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.21972574293613434, "step": 6034 }, { "epoch": 0.09011564966141303, "grad_norm": 0.341796875, "grad_norm_var": 0.00152435302734375, "learning_rate": 0.0001, "loss": 1.4169, "loss/crossentropy": 2.554524302482605, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.17864274978637695, "step": 6035 }, { "epoch": 0.09013058183202802, "grad_norm": 0.5078125, "grad_norm_var": 0.00229644775390625, "learning_rate": 0.0001, "loss": 1.691, "loss/crossentropy": 2.2157055735588074, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.24954694509506226, "step": 6036 }, { "epoch": 0.09014551400264299, "grad_norm": 0.326171875, "grad_norm_var": 0.0025454203287760416, "learning_rate": 0.0001, "loss": 1.3678, "loss/crossentropy": 2.3213934898376465, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.16856246441602707, "step": 6037 }, { "epoch": 0.09016044617325797, "grad_norm": 0.337890625, "grad_norm_var": 0.0026414076487223308, "learning_rate": 0.0001, "loss": 1.4118, "loss/crossentropy": 2.5015313625335693, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.19308914989233017, "step": 6038 }, { "epoch": 0.09017537834387296, "grad_norm": 0.373046875, "grad_norm_var": 0.002100865046183268, "learning_rate": 0.0001, "loss": 1.3884, "loss/crossentropy": 2.689542531967163, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17744171619415283, "step": 6039 }, { "epoch": 0.09019031051448793, "grad_norm": 0.380859375, "grad_norm_var": 0.00208740234375, "learning_rate": 0.0001, "loss": 1.4348, "loss/crossentropy": 2.5625566244125366, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18092302232980728, "step": 6040 }, { "epoch": 0.09020524268510292, "grad_norm": 0.35546875, "grad_norm_var": 0.0020990371704101562, "learning_rate": 0.0001, "loss": 1.5862, "loss/crossentropy": 2.559331774711609, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.23462235182523727, "step": 6041 }, { "epoch": 0.0902201748557179, "grad_norm": 0.353515625, "grad_norm_var": 0.0021227359771728515, "learning_rate": 0.0001, "loss": 1.5122, "loss/crossentropy": 2.654435396194458, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.19580399990081787, "step": 6042 }, { "epoch": 0.09023510702633289, "grad_norm": 0.341796875, "grad_norm_var": 0.002103281021118164, "learning_rate": 0.0001, "loss": 1.4992, "loss/crossentropy": 2.630675435066223, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.21400655806064606, "step": 6043 }, { "epoch": 0.09025003919694786, "grad_norm": 0.353515625, "grad_norm_var": 0.0021371841430664062, "learning_rate": 0.0001, "loss": 1.5386, "loss/crossentropy": 2.6428234577178955, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.2066163495182991, "step": 6044 }, { "epoch": 0.09026497136756284, "grad_norm": 0.38671875, "grad_norm_var": 0.002035379409790039, "learning_rate": 0.0001, "loss": 1.6431, "loss/crossentropy": 2.794923186302185, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.24079640954732895, "step": 6045 }, { "epoch": 0.09027990353817783, "grad_norm": 0.37890625, "grad_norm_var": 0.00194091796875, "learning_rate": 0.0001, "loss": 1.4893, "loss/crossentropy": 2.533490300178528, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.18072519451379776, "step": 6046 }, { "epoch": 0.0902948357087928, "grad_norm": 0.3984375, "grad_norm_var": 0.001822519302368164, "learning_rate": 0.0001, "loss": 1.7253, "loss/crossentropy": 2.5001789331436157, "loss/fcd": 1.4765625, "loss/idx": 10.0, "loss/logits": 0.24869469553232193, "step": 6047 }, { "epoch": 0.0903097678794078, "grad_norm": 0.359375, "grad_norm_var": 0.0018381118774414063, "learning_rate": 0.0001, "loss": 1.5857, "loss/crossentropy": 2.4593697786331177, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.21459520608186722, "step": 6048 }, { "epoch": 0.09032470005002277, "grad_norm": 0.326171875, "grad_norm_var": 0.0019810994466145835, "learning_rate": 0.0001, "loss": 1.3686, "loss/crossentropy": 2.1301968097686768, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.15767445415258408, "step": 6049 }, { "epoch": 0.09033963222063775, "grad_norm": 0.361328125, "grad_norm_var": 0.0018404642740885416, "learning_rate": 0.0001, "loss": 1.6686, "loss/crossentropy": 2.265449285507202, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.2349918931722641, "step": 6050 }, { "epoch": 0.09035456439125274, "grad_norm": 0.34765625, "grad_norm_var": 0.0018223921457926433, "learning_rate": 0.0001, "loss": 1.4812, "loss/crossentropy": 2.7166242599487305, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.1921098455786705, "step": 6051 }, { "epoch": 0.09036949656186771, "grad_norm": 0.330078125, "grad_norm_var": 0.000484466552734375, "learning_rate": 0.0001, "loss": 1.4148, "loss/crossentropy": 2.5112862586975098, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.18434394896030426, "step": 6052 }, { "epoch": 0.0903844287324827, "grad_norm": 0.451171875, "grad_norm_var": 0.000948333740234375, "learning_rate": 0.0001, "loss": 1.6546, "loss/crossentropy": 2.7899521589279175, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.23660480231046677, "step": 6053 }, { "epoch": 0.09039936090309768, "grad_norm": 0.3984375, "grad_norm_var": 0.0009606520334879558, "learning_rate": 0.0001, "loss": 1.6062, "loss/crossentropy": 2.6340502500534058, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.2233392894268036, "step": 6054 }, { "epoch": 0.09041429307371265, "grad_norm": 0.3828125, "grad_norm_var": 0.0009724934895833333, "learning_rate": 0.0001, "loss": 1.5268, "loss/crossentropy": 2.8182276487350464, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.21427223086357117, "step": 6055 }, { "epoch": 0.09042922524432764, "grad_norm": 0.375, "grad_norm_var": 0.0009654839833577474, "learning_rate": 0.0001, "loss": 1.467, "loss/crossentropy": 2.506082057952881, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.2013598382472992, "step": 6056 }, { "epoch": 0.09044415741494262, "grad_norm": 0.361328125, "grad_norm_var": 0.0009572347005208333, "learning_rate": 0.0001, "loss": 1.4616, "loss/crossentropy": 2.6257240772247314, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19601673632860184, "step": 6057 }, { "epoch": 0.09045908958555761, "grad_norm": 0.369140625, "grad_norm_var": 0.00093994140625, "learning_rate": 0.0001, "loss": 1.3799, "loss/crossentropy": 2.7797279357910156, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.17679598182439804, "step": 6058 }, { "epoch": 0.09047402175617258, "grad_norm": 0.90625, "grad_norm_var": 0.018721501032511394, "learning_rate": 0.0001, "loss": 1.6453, "loss/crossentropy": 2.7503514289855957, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.25861669331789017, "step": 6059 }, { "epoch": 0.09048895392678757, "grad_norm": 0.341796875, "grad_norm_var": 0.018811146418253582, "learning_rate": 0.0001, "loss": 1.3529, "loss/crossentropy": 2.676846146583557, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.16934241354465485, "step": 6060 }, { "epoch": 0.09050388609740255, "grad_norm": 0.4609375, "grad_norm_var": 0.018977848688761394, "learning_rate": 0.0001, "loss": 1.5843, "loss/crossentropy": 2.3082926273345947, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.2288709357380867, "step": 6061 }, { "epoch": 0.09051881826801753, "grad_norm": 0.34765625, "grad_norm_var": 0.019165531794230143, "learning_rate": 0.0001, "loss": 1.4741, "loss/crossentropy": 2.5515336990356445, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.18890907615423203, "step": 6062 }, { "epoch": 0.09053375043863252, "grad_norm": 0.359375, "grad_norm_var": 0.019307311375935873, "learning_rate": 0.0001, "loss": 1.5079, "loss/crossentropy": 2.579632043838501, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.20317605137825012, "step": 6063 }, { "epoch": 0.09054868260924749, "grad_norm": 0.369140625, "grad_norm_var": 0.019253985087076823, "learning_rate": 0.0001, "loss": 1.494, "loss/crossentropy": 2.445357918739319, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.18934883922338486, "step": 6064 }, { "epoch": 0.09056361477986248, "grad_norm": 0.357421875, "grad_norm_var": 0.018984413146972655, "learning_rate": 0.0001, "loss": 1.5243, "loss/crossentropy": 2.6490715742111206, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.19615552574396133, "step": 6065 }, { "epoch": 0.09057854695047746, "grad_norm": 0.34765625, "grad_norm_var": 0.019080209732055663, "learning_rate": 0.0001, "loss": 1.4572, "loss/crossentropy": 2.5956286191940308, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19154643267393112, "step": 6066 }, { "epoch": 0.09059347912109243, "grad_norm": 0.359375, "grad_norm_var": 0.018996667861938477, "learning_rate": 0.0001, "loss": 1.5792, "loss/crossentropy": 2.5702720880508423, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.23155538737773895, "step": 6067 }, { "epoch": 0.09060841129170742, "grad_norm": 0.376953125, "grad_norm_var": 0.01865105628967285, "learning_rate": 0.0001, "loss": 1.5493, "loss/crossentropy": 2.4950950145721436, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.18994250148534775, "step": 6068 }, { "epoch": 0.0906233434623224, "grad_norm": 0.36328125, "grad_norm_var": 0.018654632568359374, "learning_rate": 0.0001, "loss": 1.4041, "loss/crossentropy": 2.6161643266677856, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.1892285794019699, "step": 6069 }, { "epoch": 0.09063827563293739, "grad_norm": 0.38671875, "grad_norm_var": 0.018673133850097657, "learning_rate": 0.0001, "loss": 1.4951, "loss/crossentropy": 2.7243738174438477, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19826939702033997, "step": 6070 }, { "epoch": 0.09065320780355236, "grad_norm": 0.333984375, "grad_norm_var": 0.01896042823791504, "learning_rate": 0.0001, "loss": 1.4505, "loss/crossentropy": 2.793265700340271, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.1887715458869934, "step": 6071 }, { "epoch": 0.09066813997416734, "grad_norm": 0.390625, "grad_norm_var": 0.018921518325805665, "learning_rate": 0.0001, "loss": 1.64, "loss/crossentropy": 2.3981525897979736, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.21811556816101074, "step": 6072 }, { "epoch": 0.09068307214478233, "grad_norm": 0.34375, "grad_norm_var": 0.019036102294921874, "learning_rate": 0.0001, "loss": 1.5606, "loss/crossentropy": 2.5297006368637085, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.2246255800127983, "step": 6073 }, { "epoch": 0.0906980043153973, "grad_norm": 0.435546875, "grad_norm_var": 0.01903069814046224, "learning_rate": 0.0001, "loss": 1.6156, "loss/crossentropy": 2.6102782487869263, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.22107261419296265, "step": 6074 }, { "epoch": 0.0907129364860123, "grad_norm": 0.38671875, "grad_norm_var": 0.001180267333984375, "learning_rate": 0.0001, "loss": 1.5397, "loss/crossentropy": 2.692941427230835, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.21160446852445602, "step": 6075 }, { "epoch": 0.09072786865662727, "grad_norm": 0.333984375, "grad_norm_var": 0.00121612548828125, "learning_rate": 0.0001, "loss": 1.4932, "loss/crossentropy": 2.565977692604065, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19628364592790604, "step": 6076 }, { "epoch": 0.09074280082724225, "grad_norm": 0.353515625, "grad_norm_var": 0.0006645043690999349, "learning_rate": 0.0001, "loss": 1.3622, "loss/crossentropy": 2.383897542953491, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.15512127429246902, "step": 6077 }, { "epoch": 0.09075773299785724, "grad_norm": 0.388671875, "grad_norm_var": 0.0006728490193684896, "learning_rate": 0.0001, "loss": 1.6101, "loss/crossentropy": 2.376320481300354, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.21946093440055847, "step": 6078 }, { "epoch": 0.09077266516847221, "grad_norm": 0.37109375, "grad_norm_var": 0.0006680806477864583, "learning_rate": 0.0001, "loss": 1.4828, "loss/crossentropy": 2.8710482120513916, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.1936892718076706, "step": 6079 }, { "epoch": 0.0907875973390872, "grad_norm": 0.34375, "grad_norm_var": 0.0007067203521728515, "learning_rate": 0.0001, "loss": 1.4659, "loss/crossentropy": 2.5058178901672363, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.18853305280208588, "step": 6080 }, { "epoch": 0.09080252950970218, "grad_norm": 0.423828125, "grad_norm_var": 0.0008969465891520182, "learning_rate": 0.0001, "loss": 1.5988, "loss/crossentropy": 2.6724218130111694, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.21601977944374084, "step": 6081 }, { "epoch": 0.09081746168031717, "grad_norm": 0.365234375, "grad_norm_var": 0.0008610407511393229, "learning_rate": 0.0001, "loss": 1.4994, "loss/crossentropy": 2.7528902292251587, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.2025744691491127, "step": 6082 }, { "epoch": 0.09083239385093214, "grad_norm": 0.341796875, "grad_norm_var": 0.0009106794993082682, "learning_rate": 0.0001, "loss": 1.5424, "loss/crossentropy": 2.4195395708084106, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.21040546894073486, "step": 6083 }, { "epoch": 0.09084732602154712, "grad_norm": 0.32421875, "grad_norm_var": 0.0010441462198893228, "learning_rate": 0.0001, "loss": 1.3896, "loss/crossentropy": 2.5852750539779663, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17861932516098022, "step": 6084 }, { "epoch": 0.09086225819216211, "grad_norm": 0.33203125, "grad_norm_var": 0.0011245091756184897, "learning_rate": 0.0001, "loss": 1.3442, "loss/crossentropy": 2.5645323991775513, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.16056393086910248, "step": 6085 }, { "epoch": 0.09087719036277708, "grad_norm": 0.3671875, "grad_norm_var": 0.0010943094889322917, "learning_rate": 0.0001, "loss": 1.364, "loss/crossentropy": 2.521100878715515, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.16477833688259125, "step": 6086 }, { "epoch": 0.09089212253339207, "grad_norm": 0.38671875, "grad_norm_var": 0.0010518232981363933, "learning_rate": 0.0001, "loss": 1.5333, "loss/crossentropy": 2.6766538619995117, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.20123738050460815, "step": 6087 }, { "epoch": 0.09090705470400705, "grad_norm": 0.37109375, "grad_norm_var": 0.001016855239868164, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.546402335166931, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.18044042587280273, "step": 6088 }, { "epoch": 0.09092198687462202, "grad_norm": 0.326171875, "grad_norm_var": 0.001090240478515625, "learning_rate": 0.0001, "loss": 1.4629, "loss/crossentropy": 2.308202385902405, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.17778632044792175, "step": 6089 }, { "epoch": 0.09093691904523701, "grad_norm": 0.341796875, "grad_norm_var": 0.000766754150390625, "learning_rate": 0.0001, "loss": 1.5698, "loss/crossentropy": 2.594040036201477, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.2338205873966217, "step": 6090 }, { "epoch": 0.09095185121585199, "grad_norm": 0.33984375, "grad_norm_var": 0.000736236572265625, "learning_rate": 0.0001, "loss": 1.451, "loss/crossentropy": 2.574148654937744, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.18929476290941238, "step": 6091 }, { "epoch": 0.09096678338646698, "grad_norm": 0.361328125, "grad_norm_var": 0.0006992975870768229, "learning_rate": 0.0001, "loss": 1.4145, "loss/crossentropy": 2.4976929426193237, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18008922040462494, "step": 6092 }, { "epoch": 0.09098171555708195, "grad_norm": 0.40234375, "grad_norm_var": 0.0008149305979410807, "learning_rate": 0.0001, "loss": 1.5707, "loss/crossentropy": 2.714372754096985, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.22308270633220673, "step": 6093 }, { "epoch": 0.09099664772769693, "grad_norm": 0.416015625, "grad_norm_var": 0.0009600162506103515, "learning_rate": 0.0001, "loss": 1.6936, "loss/crossentropy": 2.784578323364258, "loss/fcd": 1.44921875, "loss/idx": 10.0, "loss/logits": 0.24442240595817566, "step": 6094 }, { "epoch": 0.09101157989831192, "grad_norm": 0.380859375, "grad_norm_var": 0.0009759902954101562, "learning_rate": 0.0001, "loss": 1.5191, "loss/crossentropy": 2.67927086353302, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.2066243290901184, "step": 6095 }, { "epoch": 0.0910265120689269, "grad_norm": 0.396484375, "grad_norm_var": 0.0010073184967041016, "learning_rate": 0.0001, "loss": 1.6039, "loss/crossentropy": 2.606769323348999, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.21326498687267303, "step": 6096 }, { "epoch": 0.09104144423954189, "grad_norm": 0.42578125, "grad_norm_var": 0.0010222752888997396, "learning_rate": 0.0001, "loss": 1.7314, "loss/crossentropy": 2.594874382019043, "loss/fcd": 1.4921875, "loss/idx": 10.0, "loss/logits": 0.23920518159866333, "step": 6097 }, { "epoch": 0.09105637641015686, "grad_norm": 0.349609375, "grad_norm_var": 0.0010421117146809896, "learning_rate": 0.0001, "loss": 1.4796, "loss/crossentropy": 2.643768310546875, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19057543575763702, "step": 6098 }, { "epoch": 0.09107130858077184, "grad_norm": 0.35546875, "grad_norm_var": 0.0010088443756103515, "learning_rate": 0.0001, "loss": 1.4585, "loss/crossentropy": 2.802249312400818, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19291278719902039, "step": 6099 }, { "epoch": 0.09108624075138683, "grad_norm": 0.33984375, "grad_norm_var": 0.0009343306223551433, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.646655321121216, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.1774023249745369, "step": 6100 }, { "epoch": 0.0911011729220018, "grad_norm": 0.408203125, "grad_norm_var": 0.0009287516276041667, "learning_rate": 0.0001, "loss": 1.6055, "loss/crossentropy": 2.3960962295532227, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.22663778066635132, "step": 6101 }, { "epoch": 0.09111610509261679, "grad_norm": 0.419921875, "grad_norm_var": 0.0010613600413004558, "learning_rate": 0.0001, "loss": 1.588, "loss/crossentropy": 2.540359377861023, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.2091054543852806, "step": 6102 }, { "epoch": 0.09113103726323177, "grad_norm": 0.37890625, "grad_norm_var": 0.0010543664296468098, "learning_rate": 0.0001, "loss": 1.6301, "loss/crossentropy": 2.601195454597473, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.23945478349924088, "step": 6103 }, { "epoch": 0.09114596943384676, "grad_norm": 0.37890625, "grad_norm_var": 0.0010532220204671225, "learning_rate": 0.0001, "loss": 1.5439, "loss/crossentropy": 2.545901894569397, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.19238600134849548, "step": 6104 }, { "epoch": 0.09116090160446173, "grad_norm": 0.54296875, "grad_norm_var": 0.0025405248006184896, "learning_rate": 0.0001, "loss": 1.6112, "loss/crossentropy": 2.7719119787216187, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.2128063216805458, "step": 6105 }, { "epoch": 0.09117583377507671, "grad_norm": 0.39453125, "grad_norm_var": 0.0023761590321858725, "learning_rate": 0.0001, "loss": 1.4417, "loss/crossentropy": 2.637226700782776, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.1877848207950592, "step": 6106 }, { "epoch": 0.0911907659456917, "grad_norm": 0.38671875, "grad_norm_var": 0.002180083592732747, "learning_rate": 0.0001, "loss": 1.5147, "loss/crossentropy": 2.720970869064331, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.2100158929824829, "step": 6107 }, { "epoch": 0.09120569811630667, "grad_norm": 0.35546875, "grad_norm_var": 0.0022094090779622394, "learning_rate": 0.0001, "loss": 1.4949, "loss/crossentropy": 2.6433314085006714, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.20191459357738495, "step": 6108 }, { "epoch": 0.09122063028692166, "grad_norm": 0.408203125, "grad_norm_var": 0.0022167046864827473, "learning_rate": 0.0001, "loss": 1.6139, "loss/crossentropy": 2.708917498588562, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.21157139539718628, "step": 6109 }, { "epoch": 0.09123556245753664, "grad_norm": 0.333984375, "grad_norm_var": 0.0024196465810139975, "learning_rate": 0.0001, "loss": 1.3706, "loss/crossentropy": 2.731690526008606, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.18700239062309265, "step": 6110 }, { "epoch": 0.09125049462815162, "grad_norm": 0.341796875, "grad_norm_var": 0.0025677839914957684, "learning_rate": 0.0001, "loss": 1.4533, "loss/crossentropy": 2.540285110473633, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.17591877281665802, "step": 6111 }, { "epoch": 0.0912654267987666, "grad_norm": 0.40234375, "grad_norm_var": 0.002576128641764323, "learning_rate": 0.0001, "loss": 1.5319, "loss/crossentropy": 2.613221764564514, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.19203398376703262, "step": 6112 }, { "epoch": 0.09128035896938158, "grad_norm": 0.37890625, "grad_norm_var": 0.0024830500284830728, "learning_rate": 0.0001, "loss": 1.5268, "loss/crossentropy": 2.6293373107910156, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.19864721596240997, "step": 6113 }, { "epoch": 0.09129529113999657, "grad_norm": 0.361328125, "grad_norm_var": 0.0024347941080729166, "learning_rate": 0.0001, "loss": 1.423, "loss/crossentropy": 2.723051428794861, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18858088552951813, "step": 6114 }, { "epoch": 0.09131022331061155, "grad_norm": 0.3359375, "grad_norm_var": 0.0025400161743164063, "learning_rate": 0.0001, "loss": 1.3285, "loss/crossentropy": 2.657591462135315, "loss/fcd": 1.16015625, "loss/idx": 10.0, "loss/logits": 0.16833172738552094, "step": 6115 }, { "epoch": 0.09132515548122652, "grad_norm": 0.359375, "grad_norm_var": 0.002444966634114583, "learning_rate": 0.0001, "loss": 1.4797, "loss/crossentropy": 2.49149751663208, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.19456341117620468, "step": 6116 }, { "epoch": 0.09134008765184151, "grad_norm": 0.341796875, "grad_norm_var": 0.002530352274576823, "learning_rate": 0.0001, "loss": 1.4484, "loss/crossentropy": 2.5965592861175537, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19058074057102203, "step": 6117 }, { "epoch": 0.09135501982245649, "grad_norm": 0.3515625, "grad_norm_var": 0.002481953303019206, "learning_rate": 0.0001, "loss": 1.4941, "loss/crossentropy": 3.069920063018799, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.1972174346446991, "step": 6118 }, { "epoch": 0.09136995199307148, "grad_norm": 0.341796875, "grad_norm_var": 0.00256500244140625, "learning_rate": 0.0001, "loss": 1.3983, "loss/crossentropy": 2.6071066856384277, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.18342091143131256, "step": 6119 }, { "epoch": 0.09138488416368645, "grad_norm": 0.353515625, "grad_norm_var": 0.002595376968383789, "learning_rate": 0.0001, "loss": 1.4651, "loss/crossentropy": 2.6016217470169067, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.17989853769540787, "step": 6120 }, { "epoch": 0.09139981633430144, "grad_norm": 0.328125, "grad_norm_var": 0.0006511529286702474, "learning_rate": 0.0001, "loss": 1.4541, "loss/crossentropy": 2.699900507926941, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19632237404584885, "step": 6121 }, { "epoch": 0.09141474850491642, "grad_norm": 0.33203125, "grad_norm_var": 0.0006155490875244141, "learning_rate": 0.0001, "loss": 1.502, "loss/crossentropy": 2.6680771112442017, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.2050970271229744, "step": 6122 }, { "epoch": 0.0914296806755314, "grad_norm": 0.384765625, "grad_norm_var": 0.000608062744140625, "learning_rate": 0.0001, "loss": 1.534, "loss/crossentropy": 2.676210641860962, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.20196563005447388, "step": 6123 }, { "epoch": 0.09144461284614638, "grad_norm": 0.373046875, "grad_norm_var": 0.0006239414215087891, "learning_rate": 0.0001, "loss": 1.5922, "loss/crossentropy": 2.671533226966858, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.2250610888004303, "step": 6124 }, { "epoch": 0.09145954501676136, "grad_norm": 0.47265625, "grad_norm_var": 0.0013147354125976562, "learning_rate": 0.0001, "loss": 1.5435, "loss/crossentropy": 2.702773094177246, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.21146205067634583, "step": 6125 }, { "epoch": 0.09147447718737635, "grad_norm": 0.39453125, "grad_norm_var": 0.0013171990712483725, "learning_rate": 0.0001, "loss": 1.4387, "loss/crossentropy": 2.5532472133636475, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1808386743068695, "step": 6126 }, { "epoch": 0.09148940935799132, "grad_norm": 0.431640625, "grad_norm_var": 0.0015336195627848306, "learning_rate": 0.0001, "loss": 1.539, "loss/crossentropy": 2.5550211668014526, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.19529235363006592, "step": 6127 }, { "epoch": 0.0915043415286063, "grad_norm": 0.3671875, "grad_norm_var": 0.0014660994211832682, "learning_rate": 0.0001, "loss": 1.4768, "loss/crossentropy": 2.5837544202804565, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.20331839472055435, "step": 6128 }, { "epoch": 0.09151927369922129, "grad_norm": 0.376953125, "grad_norm_var": 0.0014638264973958334, "learning_rate": 0.0001, "loss": 1.5754, "loss/crossentropy": 2.5792754888534546, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.24337287992238998, "step": 6129 }, { "epoch": 0.09153420586983627, "grad_norm": 0.337890625, "grad_norm_var": 0.0015225728352864583, "learning_rate": 0.0001, "loss": 1.4383, "loss/crossentropy": 2.7081764936447144, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18434644490480423, "step": 6130 }, { "epoch": 0.09154913804045126, "grad_norm": 0.4140625, "grad_norm_var": 0.0015734354654947916, "learning_rate": 0.0001, "loss": 1.5736, "loss/crossentropy": 2.6279131174087524, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.22987257689237595, "step": 6131 }, { "epoch": 0.09156407021106623, "grad_norm": 0.357421875, "grad_norm_var": 0.0015771071116129558, "learning_rate": 0.0001, "loss": 1.5643, "loss/crossentropy": 2.7730672359466553, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.22833939641714096, "step": 6132 }, { "epoch": 0.09157900238168121, "grad_norm": 0.365234375, "grad_norm_var": 0.0015156904856363932, "learning_rate": 0.0001, "loss": 1.4359, "loss/crossentropy": 2.6846978664398193, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.1819649040699005, "step": 6133 }, { "epoch": 0.0915939345522962, "grad_norm": 0.384765625, "grad_norm_var": 0.0014856974283854166, "learning_rate": 0.0001, "loss": 1.5176, "loss/crossentropy": 2.6400567293167114, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20506147295236588, "step": 6134 }, { "epoch": 0.09160886672291117, "grad_norm": 0.40234375, "grad_norm_var": 0.0014388879140218099, "learning_rate": 0.0001, "loss": 1.4895, "loss/crossentropy": 2.770833730697632, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.20436853915452957, "step": 6135 }, { "epoch": 0.09162379889352616, "grad_norm": 0.33203125, "grad_norm_var": 0.0015429178873697916, "learning_rate": 0.0001, "loss": 1.3301, "loss/crossentropy": 2.5460429191589355, "loss/fcd": 1.16015625, "loss/idx": 10.0, "loss/logits": 0.1699802130460739, "step": 6136 }, { "epoch": 0.09163873106414114, "grad_norm": 0.34375, "grad_norm_var": 0.001453399658203125, "learning_rate": 0.0001, "loss": 1.4261, "loss/crossentropy": 2.6409910917282104, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.17610055208206177, "step": 6137 }, { "epoch": 0.09165366323475611, "grad_norm": 0.3671875, "grad_norm_var": 0.0013086318969726563, "learning_rate": 0.0001, "loss": 1.4956, "loss/crossentropy": 2.738284468650818, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.1948104128241539, "step": 6138 }, { "epoch": 0.0916685954053711, "grad_norm": 0.33203125, "grad_norm_var": 0.001460123062133789, "learning_rate": 0.0001, "loss": 1.358, "loss/crossentropy": 2.4894973039627075, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.1743941754102707, "step": 6139 }, { "epoch": 0.09168352757598608, "grad_norm": 0.3515625, "grad_norm_var": 0.0015040079752604166, "learning_rate": 0.0001, "loss": 1.4221, "loss/crossentropy": 2.524847626686096, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1916450560092926, "step": 6140 }, { "epoch": 0.09169845974660107, "grad_norm": 0.35546875, "grad_norm_var": 0.0008669535319010417, "learning_rate": 0.0001, "loss": 1.4481, "loss/crossentropy": 2.639094114303589, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.20200015604496002, "step": 6141 }, { "epoch": 0.09171339191721604, "grad_norm": 0.42578125, "grad_norm_var": 0.0010317484537760416, "learning_rate": 0.0001, "loss": 1.4304, "loss/crossentropy": 2.7036246061325073, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.17653214186429977, "step": 6142 }, { "epoch": 0.09172832408783103, "grad_norm": 0.37109375, "grad_norm_var": 0.0007760206858317058, "learning_rate": 0.0001, "loss": 1.4507, "loss/crossentropy": 2.577373743057251, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.1812090426683426, "step": 6143 }, { "epoch": 0.09174325625844601, "grad_norm": 0.38671875, "grad_norm_var": 0.0007982730865478515, "learning_rate": 0.0001, "loss": 1.543, "loss/crossentropy": 2.427199363708496, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.21491242945194244, "step": 6144 }, { "epoch": 0.09175818842906099, "grad_norm": 0.3671875, "grad_norm_var": 0.0007939020792643229, "learning_rate": 0.0001, "loss": 1.4998, "loss/crossentropy": 2.4696022272109985, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.18729469180107117, "step": 6145 }, { "epoch": 0.09177312059967598, "grad_norm": 0.32421875, "grad_norm_var": 0.000861215591430664, "learning_rate": 0.0001, "loss": 1.355, "loss/crossentropy": 2.4250099658966064, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.17530125379562378, "step": 6146 }, { "epoch": 0.09178805277029095, "grad_norm": 0.380859375, "grad_norm_var": 0.0007242202758789062, "learning_rate": 0.0001, "loss": 1.4166, "loss/crossentropy": 2.5936636924743652, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.1783129870891571, "step": 6147 }, { "epoch": 0.09180298494090594, "grad_norm": 0.3828125, "grad_norm_var": 0.0007372379302978515, "learning_rate": 0.0001, "loss": 1.4289, "loss/crossentropy": 2.6442079544067383, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18669361621141434, "step": 6148 }, { "epoch": 0.09181791711152092, "grad_norm": 0.37109375, "grad_norm_var": 0.0007379531860351562, "learning_rate": 0.0001, "loss": 1.403, "loss/crossentropy": 2.6609199047088623, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17638975381851196, "step": 6149 }, { "epoch": 0.09183284928213589, "grad_norm": 0.3828125, "grad_norm_var": 0.0007336775461832683, "learning_rate": 0.0001, "loss": 1.5715, "loss/crossentropy": 2.0852418541908264, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.18872031569480896, "step": 6150 }, { "epoch": 0.09184778145275088, "grad_norm": 0.404296875, "grad_norm_var": 0.0007430394490559896, "learning_rate": 0.0001, "loss": 1.5693, "loss/crossentropy": 2.724387049674988, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.21379423141479492, "step": 6151 }, { "epoch": 0.09186271362336586, "grad_norm": 0.416015625, "grad_norm_var": 0.0007874647776285808, "learning_rate": 0.0001, "loss": 1.442, "loss/crossentropy": 2.4562207460403442, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.19198894500732422, "step": 6152 }, { "epoch": 0.09187764579398085, "grad_norm": 0.375, "grad_norm_var": 0.0007279555002848308, "learning_rate": 0.0001, "loss": 1.5429, "loss/crossentropy": 2.618786931037903, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.21865315735340118, "step": 6153 }, { "epoch": 0.09189257796459582, "grad_norm": 0.328125, "grad_norm_var": 0.0008621056874593099, "learning_rate": 0.0001, "loss": 1.4758, "loss/crossentropy": 2.7216498851776123, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19841836392879486, "step": 6154 }, { "epoch": 0.0919075101352108, "grad_norm": 0.349609375, "grad_norm_var": 0.0007872899373372396, "learning_rate": 0.0001, "loss": 1.412, "loss/crossentropy": 2.4983357191085815, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.18149007111787796, "step": 6155 }, { "epoch": 0.09192244230582579, "grad_norm": 0.40234375, "grad_norm_var": 0.0008013407389322917, "learning_rate": 0.0001, "loss": 1.6028, "loss/crossentropy": 2.6362966299057007, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.21994341164827347, "step": 6156 }, { "epoch": 0.09193737447644076, "grad_norm": 0.357421875, "grad_norm_var": 0.0007961114247639974, "learning_rate": 0.0001, "loss": 1.562, "loss/crossentropy": 2.604239821434021, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.21438545733690262, "step": 6157 }, { "epoch": 0.09195230664705575, "grad_norm": 0.37890625, "grad_norm_var": 0.0006259759267171224, "learning_rate": 0.0001, "loss": 1.5692, "loss/crossentropy": 2.4240533113479614, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.20982126891613007, "step": 6158 }, { "epoch": 0.09196723881767073, "grad_norm": 0.5625, "grad_norm_var": 0.00285032590230306, "learning_rate": 0.0001, "loss": 1.6584, "loss/crossentropy": 2.482264280319214, "loss/fcd": 1.453125, "loss/idx": 10.0, "loss/logits": 0.205306775867939, "step": 6159 }, { "epoch": 0.0919821709882857, "grad_norm": 0.341796875, "grad_norm_var": 0.0029698689778645832, "learning_rate": 0.0001, "loss": 1.4043, "loss/crossentropy": 2.568082094192505, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17777322232723236, "step": 6160 }, { "epoch": 0.0919971031589007, "grad_norm": 0.5078125, "grad_norm_var": 0.003912862141927083, "learning_rate": 0.0001, "loss": 1.5285, "loss/crossentropy": 2.851647734642029, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20814060419797897, "step": 6161 }, { "epoch": 0.09201203532951567, "grad_norm": 0.359375, "grad_norm_var": 0.0036742528279622394, "learning_rate": 0.0001, "loss": 1.578, "loss/crossentropy": 2.4439263343811035, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.22252894192934036, "step": 6162 }, { "epoch": 0.09202696750013066, "grad_norm": 0.326171875, "grad_norm_var": 0.003955523173014323, "learning_rate": 0.0001, "loss": 1.522, "loss/crossentropy": 2.321960926055908, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.20554906874895096, "step": 6163 }, { "epoch": 0.09204189967074564, "grad_norm": 0.384765625, "grad_norm_var": 0.003953790664672852, "learning_rate": 0.0001, "loss": 1.6458, "loss/crossentropy": 2.4592182636260986, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.2278415858745575, "step": 6164 }, { "epoch": 0.09205683184136063, "grad_norm": 0.3984375, "grad_norm_var": 0.003929758071899414, "learning_rate": 0.0001, "loss": 1.4959, "loss/crossentropy": 2.7820165157318115, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.2107861042022705, "step": 6165 }, { "epoch": 0.0920717640119756, "grad_norm": 0.3984375, "grad_norm_var": 0.003925434748331706, "learning_rate": 0.0001, "loss": 1.7875, "loss/crossentropy": 2.398226737976074, "loss/fcd": 1.5078125, "loss/idx": 10.0, "loss/logits": 0.2797267287969589, "step": 6166 }, { "epoch": 0.09208669618259058, "grad_norm": 0.515625, "grad_norm_var": 0.004864947001139323, "learning_rate": 0.0001, "loss": 1.7101, "loss/crossentropy": 2.5043424367904663, "loss/fcd": 1.4765625, "loss/idx": 10.0, "loss/logits": 0.2335553988814354, "step": 6167 }, { "epoch": 0.09210162835320557, "grad_norm": 0.494140625, "grad_norm_var": 0.005411720275878907, "learning_rate": 0.0001, "loss": 1.8019, "loss/crossentropy": 2.2200326919555664, "loss/fcd": 1.52734375, "loss/idx": 10.0, "loss/logits": 0.27459659427404404, "step": 6168 }, { "epoch": 0.09211656052382054, "grad_norm": 0.392578125, "grad_norm_var": 0.005360651016235352, "learning_rate": 0.0001, "loss": 1.7231, "loss/crossentropy": 2.4139145612716675, "loss/fcd": 1.4765625, "loss/idx": 10.0, "loss/logits": 0.2465568631887436, "step": 6169 }, { "epoch": 0.09213149269443553, "grad_norm": 0.365234375, "grad_norm_var": 0.005060768127441407, "learning_rate": 0.0001, "loss": 1.5175, "loss/crossentropy": 2.5248111486434937, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.21276773512363434, "step": 6170 }, { "epoch": 0.09214642486505051, "grad_norm": 0.353515625, "grad_norm_var": 0.005031077067057291, "learning_rate": 0.0001, "loss": 1.5785, "loss/crossentropy": 2.3988213539123535, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.20346258580684662, "step": 6171 }, { "epoch": 0.09216135703566548, "grad_norm": 0.478515625, "grad_norm_var": 0.005329243342081706, "learning_rate": 0.0001, "loss": 1.961, "loss/crossentropy": 2.258615732192993, "loss/fcd": 1.69140625, "loss/idx": 10.0, "loss/logits": 0.2695552334189415, "step": 6172 }, { "epoch": 0.09217628920628047, "grad_norm": 0.337890625, "grad_norm_var": 0.005498997370402018, "learning_rate": 0.0001, "loss": 1.5516, "loss/crossentropy": 2.63633131980896, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.2195233851671219, "step": 6173 }, { "epoch": 0.09219122137689545, "grad_norm": 0.373046875, "grad_norm_var": 0.005527178446451823, "learning_rate": 0.0001, "loss": 1.5063, "loss/crossentropy": 2.7372167110443115, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.20556242763996124, "step": 6174 }, { "epoch": 0.09220615354751044, "grad_norm": 0.337890625, "grad_norm_var": 0.004169066747029622, "learning_rate": 0.0001, "loss": 1.2742, "loss/crossentropy": 2.739104151725769, "loss/fcd": 1.125, "loss/idx": 10.0, "loss/logits": 0.14922717213630676, "step": 6175 }, { "epoch": 0.09222108571812541, "grad_norm": 0.33984375, "grad_norm_var": 0.0041838963826497395, "learning_rate": 0.0001, "loss": 1.3778, "loss/crossentropy": 2.467579960823059, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.16688791662454605, "step": 6176 }, { "epoch": 0.09223601788874039, "grad_norm": 0.33203125, "grad_norm_var": 0.0035344441731770832, "learning_rate": 0.0001, "loss": 1.4555, "loss/crossentropy": 2.051989257335663, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.17819702625274658, "step": 6177 }, { "epoch": 0.09225095005935538, "grad_norm": 0.3984375, "grad_norm_var": 0.003487396240234375, "learning_rate": 0.0001, "loss": 1.515, "loss/crossentropy": 2.364523410797119, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.19081460684537888, "step": 6178 }, { "epoch": 0.09226588222997036, "grad_norm": 0.365234375, "grad_norm_var": 0.00325469970703125, "learning_rate": 0.0001, "loss": 1.4451, "loss/crossentropy": 2.8128284215927124, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.17952275276184082, "step": 6179 }, { "epoch": 0.09228081440058535, "grad_norm": 0.3828125, "grad_norm_var": 0.003256718317667643, "learning_rate": 0.0001, "loss": 1.4651, "loss/crossentropy": 2.6740912199020386, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19947214424610138, "step": 6180 }, { "epoch": 0.09229574657120032, "grad_norm": 0.39453125, "grad_norm_var": 0.003254048029581706, "learning_rate": 0.0001, "loss": 1.6213, "loss/crossentropy": 2.7978971004486084, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.23454274237155914, "step": 6181 }, { "epoch": 0.09231067874181531, "grad_norm": 0.359375, "grad_norm_var": 0.003311904271443685, "learning_rate": 0.0001, "loss": 1.4802, "loss/crossentropy": 2.8427035808563232, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.20678813755512238, "step": 6182 }, { "epoch": 0.09232561091243029, "grad_norm": 0.384765625, "grad_norm_var": 0.002169227600097656, "learning_rate": 0.0001, "loss": 1.5133, "loss/crossentropy": 2.1500304341316223, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.17739716172218323, "step": 6183 }, { "epoch": 0.09234054308304526, "grad_norm": 0.359375, "grad_norm_var": 0.0012644290924072265, "learning_rate": 0.0001, "loss": 1.5712, "loss/crossentropy": 2.348651170730591, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.2079424485564232, "step": 6184 }, { "epoch": 0.09235547525366025, "grad_norm": 0.369140625, "grad_norm_var": 0.001235055923461914, "learning_rate": 0.0001, "loss": 1.5147, "loss/crossentropy": 2.727397918701172, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20223147422075272, "step": 6185 }, { "epoch": 0.09237040742427523, "grad_norm": 0.373046875, "grad_norm_var": 0.0012331485748291015, "learning_rate": 0.0001, "loss": 1.506, "loss/crossentropy": 2.5263423919677734, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.1856403723359108, "step": 6186 }, { "epoch": 0.09238533959489022, "grad_norm": 0.34375, "grad_norm_var": 0.0012621561686197917, "learning_rate": 0.0001, "loss": 1.5496, "loss/crossentropy": 2.5541250705718994, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.213619664311409, "step": 6187 }, { "epoch": 0.09240027176550519, "grad_norm": 0.3515625, "grad_norm_var": 0.0004428704579671224, "learning_rate": 0.0001, "loss": 1.3548, "loss/crossentropy": 2.481335997581482, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.15953267365694046, "step": 6188 }, { "epoch": 0.09241520393612017, "grad_norm": 0.390625, "grad_norm_var": 0.0004424413045247396, "learning_rate": 0.0001, "loss": 1.4365, "loss/crossentropy": 2.561695098876953, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18257316946983337, "step": 6189 }, { "epoch": 0.09243013610673516, "grad_norm": 0.345703125, "grad_norm_var": 0.0004633585611979167, "learning_rate": 0.0001, "loss": 1.5338, "loss/crossentropy": 2.804479479789734, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.2213274985551834, "step": 6190 }, { "epoch": 0.09244506827735013, "grad_norm": 0.341796875, "grad_norm_var": 0.0004505793253580729, "learning_rate": 0.0001, "loss": 1.4637, "loss/crossentropy": 2.5183826684951782, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19023635983467102, "step": 6191 }, { "epoch": 0.09246000044796512, "grad_norm": 0.318359375, "grad_norm_var": 0.0005500634511311849, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.4802610874176025, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.17995479702949524, "step": 6192 }, { "epoch": 0.0924749326185801, "grad_norm": 0.388671875, "grad_norm_var": 0.0005154927571614583, "learning_rate": 0.0001, "loss": 1.6849, "loss/crossentropy": 2.5962514877319336, "loss/fcd": 1.4296875, "loss/idx": 10.0, "loss/logits": 0.2552068382501602, "step": 6193 }, { "epoch": 0.09248986478919508, "grad_norm": 0.32421875, "grad_norm_var": 0.0005456924438476563, "learning_rate": 0.0001, "loss": 1.4727, "loss/crossentropy": 2.4628801345825195, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.17974958568811417, "step": 6194 }, { "epoch": 0.09250479695981007, "grad_norm": 0.3203125, "grad_norm_var": 0.0006528059641520182, "learning_rate": 0.0001, "loss": 1.448, "loss/crossentropy": 2.7474584579467773, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.18624259531497955, "step": 6195 }, { "epoch": 0.09251972913042504, "grad_norm": 0.3515625, "grad_norm_var": 0.0006156762440999349, "learning_rate": 0.0001, "loss": 1.5509, "loss/crossentropy": 2.665806531906128, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.21109240502119064, "step": 6196 }, { "epoch": 0.09253466130104003, "grad_norm": 0.369140625, "grad_norm_var": 0.0005299250284830729, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.5981693267822266, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.1707129329442978, "step": 6197 }, { "epoch": 0.092549593471655, "grad_norm": 0.384765625, "grad_norm_var": 0.0005826155344645182, "learning_rate": 0.0001, "loss": 1.6679, "loss/crossentropy": 2.591779589653015, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.24211319535970688, "step": 6198 }, { "epoch": 0.09256452564226998, "grad_norm": 0.361328125, "grad_norm_var": 0.0005311171213785807, "learning_rate": 0.0001, "loss": 1.4871, "loss/crossentropy": 2.6996911764144897, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19802434742450714, "step": 6199 }, { "epoch": 0.09257945781288497, "grad_norm": 0.357421875, "grad_norm_var": 0.0005304336547851563, "learning_rate": 0.0001, "loss": 1.5418, "loss/crossentropy": 2.572067975997925, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.21364359557628632, "step": 6200 }, { "epoch": 0.09259438998349995, "grad_norm": 0.390625, "grad_norm_var": 0.0005977471669514974, "learning_rate": 0.0001, "loss": 1.6585, "loss/crossentropy": 2.4908641576766968, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.22494134306907654, "step": 6201 }, { "epoch": 0.09260932215411494, "grad_norm": 0.40234375, "grad_norm_var": 0.0007138570149739583, "learning_rate": 0.0001, "loss": 1.7923, "loss/crossentropy": 2.4398876428604126, "loss/fcd": 1.55078125, "loss/idx": 10.0, "loss/logits": 0.241511270403862, "step": 6202 }, { "epoch": 0.09262425432472991, "grad_norm": 0.3359375, "grad_norm_var": 0.0007334391276041667, "learning_rate": 0.0001, "loss": 1.5563, "loss/crossentropy": 2.6230591535568237, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.21257608383893967, "step": 6203 }, { "epoch": 0.0926391864953449, "grad_norm": 0.361328125, "grad_norm_var": 0.0007304986317952474, "learning_rate": 0.0001, "loss": 1.506, "loss/crossentropy": 2.6281514167785645, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.1973649188876152, "step": 6204 }, { "epoch": 0.09265411866595988, "grad_norm": 0.384765625, "grad_norm_var": 0.0007079442342122396, "learning_rate": 0.0001, "loss": 1.4679, "loss/crossentropy": 2.914362907409668, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.20224007964134216, "step": 6205 }, { "epoch": 0.09266905083657485, "grad_norm": 0.419921875, "grad_norm_var": 0.0009241739908854167, "learning_rate": 0.0001, "loss": 1.3925, "loss/crossentropy": 2.778603434562683, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.16983112692832947, "step": 6206 }, { "epoch": 0.09268398300718984, "grad_norm": 0.326171875, "grad_norm_var": 0.00098419189453125, "learning_rate": 0.0001, "loss": 1.4321, "loss/crossentropy": 2.4183095693588257, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18987607955932617, "step": 6207 }, { "epoch": 0.09269891517780482, "grad_norm": 0.34375, "grad_norm_var": 0.0008757114410400391, "learning_rate": 0.0001, "loss": 1.5686, "loss/crossentropy": 2.6249682903289795, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.20144574344158173, "step": 6208 }, { "epoch": 0.09271384734841981, "grad_norm": 0.361328125, "grad_norm_var": 0.0008320967356363932, "learning_rate": 0.0001, "loss": 1.4879, "loss/crossentropy": 2.5253041982650757, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.20277002453804016, "step": 6209 }, { "epoch": 0.09272877951903478, "grad_norm": 0.390625, "grad_norm_var": 0.0007715702056884765, "learning_rate": 0.0001, "loss": 1.5517, "loss/crossentropy": 2.59028697013855, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.21190239489078522, "step": 6210 }, { "epoch": 0.09274371168964976, "grad_norm": 0.36328125, "grad_norm_var": 0.0006233056386311849, "learning_rate": 0.0001, "loss": 1.6111, "loss/crossentropy": 2.5772756338119507, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.23607076704502106, "step": 6211 }, { "epoch": 0.09275864386026475, "grad_norm": 0.3515625, "grad_norm_var": 0.0006233056386311849, "learning_rate": 0.0001, "loss": 1.4053, "loss/crossentropy": 2.5275378227233887, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.18268971145153046, "step": 6212 }, { "epoch": 0.09277357603087973, "grad_norm": 0.404296875, "grad_norm_var": 0.0007011254628499348, "learning_rate": 0.0001, "loss": 1.6273, "loss/crossentropy": 2.8365156650543213, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.25230487436056137, "step": 6213 }, { "epoch": 0.09278850820149472, "grad_norm": 0.3515625, "grad_norm_var": 0.0007100423177083333, "learning_rate": 0.0001, "loss": 1.5337, "loss/crossentropy": 2.5237698554992676, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.19777953624725342, "step": 6214 }, { "epoch": 0.09280344037210969, "grad_norm": 0.396484375, "grad_norm_var": 0.0007506688435872396, "learning_rate": 0.0001, "loss": 1.513, "loss/crossentropy": 2.651079773902893, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.20437506586313248, "step": 6215 }, { "epoch": 0.09281837254272467, "grad_norm": 0.330078125, "grad_norm_var": 0.0008481343587239584, "learning_rate": 0.0001, "loss": 1.4536, "loss/crossentropy": 2.7472262382507324, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1958327442407608, "step": 6216 }, { "epoch": 0.09283330471333966, "grad_norm": 0.361328125, "grad_norm_var": 0.0008197625478108724, "learning_rate": 0.0001, "loss": 1.428, "loss/crossentropy": 2.7247647047042847, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.19363564252853394, "step": 6217 }, { "epoch": 0.09284823688395463, "grad_norm": 0.357421875, "grad_norm_var": 0.0007389704386393229, "learning_rate": 0.0001, "loss": 1.5397, "loss/crossentropy": 2.5550774335861206, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.22325635701417923, "step": 6218 }, { "epoch": 0.09286316905456962, "grad_norm": 0.37890625, "grad_norm_var": 0.0006879170735677084, "learning_rate": 0.0001, "loss": 1.4804, "loss/crossentropy": 2.5691280364990234, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.1717609316110611, "step": 6219 }, { "epoch": 0.0928781012251846, "grad_norm": 0.34765625, "grad_norm_var": 0.0007111708323160808, "learning_rate": 0.0001, "loss": 1.3916, "loss/crossentropy": 2.4686696529388428, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.16504253447055817, "step": 6220 }, { "epoch": 0.09289303339579957, "grad_norm": 0.359375, "grad_norm_var": 0.0006907145182291667, "learning_rate": 0.0001, "loss": 1.4463, "loss/crossentropy": 2.7060861587524414, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18070931732654572, "step": 6221 }, { "epoch": 0.09290796556641456, "grad_norm": 0.35546875, "grad_norm_var": 0.0004803816477457682, "learning_rate": 0.0001, "loss": 1.5251, "loss/crossentropy": 2.557792067527771, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20483481138944626, "step": 6222 }, { "epoch": 0.09292289773702954, "grad_norm": 0.40234375, "grad_norm_var": 0.0004872004191080729, "learning_rate": 0.0001, "loss": 1.574, "loss/crossentropy": 2.6079829931259155, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.21466585993766785, "step": 6223 }, { "epoch": 0.09293782990764453, "grad_norm": 0.35546875, "grad_norm_var": 0.00046106974283854164, "learning_rate": 0.0001, "loss": 1.5897, "loss/crossentropy": 2.4879013299942017, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.22638390213251114, "step": 6224 }, { "epoch": 0.0929527620782595, "grad_norm": 0.33984375, "grad_norm_var": 0.0005053043365478516, "learning_rate": 0.0001, "loss": 1.4651, "loss/crossentropy": 2.500460386276245, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19945623725652695, "step": 6225 }, { "epoch": 0.0929676942488745, "grad_norm": 0.365234375, "grad_norm_var": 0.000460052490234375, "learning_rate": 0.0001, "loss": 1.4261, "loss/crossentropy": 2.5954395532608032, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.17612306773662567, "step": 6226 }, { "epoch": 0.09298262641948947, "grad_norm": 0.337890625, "grad_norm_var": 0.0005019982655843099, "learning_rate": 0.0001, "loss": 1.4834, "loss/crossentropy": 2.508178472518921, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.19047745317220688, "step": 6227 }, { "epoch": 0.09299755859010445, "grad_norm": 0.34765625, "grad_norm_var": 0.0005084832509358723, "learning_rate": 0.0001, "loss": 1.546, "loss/crossentropy": 2.4413628578186035, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.1983048915863037, "step": 6228 }, { "epoch": 0.09301249076071944, "grad_norm": 0.35546875, "grad_norm_var": 0.00038172403971354165, "learning_rate": 0.0001, "loss": 1.3802, "loss/crossentropy": 2.6360023021698, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.18093281239271164, "step": 6229 }, { "epoch": 0.09302742293133441, "grad_norm": 0.78515625, "grad_norm_var": 0.011708513895670573, "learning_rate": 0.0001, "loss": 1.5611, "loss/crossentropy": 2.583914875984192, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.21347416937351227, "step": 6230 }, { "epoch": 0.0930423551019494, "grad_norm": 0.3671875, "grad_norm_var": 0.011721150080362955, "learning_rate": 0.0001, "loss": 1.6531, "loss/crossentropy": 2.498905062675476, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.23511683195829391, "step": 6231 }, { "epoch": 0.09305728727256438, "grad_norm": 0.37109375, "grad_norm_var": 0.011530558268229166, "learning_rate": 0.0001, "loss": 1.4379, "loss/crossentropy": 2.9456393718719482, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.19567697495222092, "step": 6232 }, { "epoch": 0.09307221944317935, "grad_norm": 0.39453125, "grad_norm_var": 0.011487054824829101, "learning_rate": 0.0001, "loss": 1.5318, "loss/crossentropy": 2.7902534008026123, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.19194715470075607, "step": 6233 }, { "epoch": 0.09308715161379434, "grad_norm": 0.42578125, "grad_norm_var": 0.011493174235026042, "learning_rate": 0.0001, "loss": 1.66, "loss/crossentropy": 2.52525532245636, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.2381007820367813, "step": 6234 }, { "epoch": 0.09310208378440932, "grad_norm": 0.3359375, "grad_norm_var": 0.011689694722493489, "learning_rate": 0.0001, "loss": 1.4701, "loss/crossentropy": 2.5950952768325806, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19276807457208633, "step": 6235 }, { "epoch": 0.09311701595502431, "grad_norm": 0.392578125, "grad_norm_var": 0.01155991554260254, "learning_rate": 0.0001, "loss": 1.5454, "loss/crossentropy": 2.4661455154418945, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.23681434988975525, "step": 6236 }, { "epoch": 0.09313194812563928, "grad_norm": 0.34375, "grad_norm_var": 0.011645619074503582, "learning_rate": 0.0001, "loss": 1.3809, "loss/crossentropy": 2.7269537448883057, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.17385590076446533, "step": 6237 }, { "epoch": 0.09314688029625426, "grad_norm": 0.33984375, "grad_norm_var": 0.011737426122029623, "learning_rate": 0.0001, "loss": 1.3998, "loss/crossentropy": 2.6472924947738647, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.1771361529827118, "step": 6238 }, { "epoch": 0.09316181246686925, "grad_norm": 0.314453125, "grad_norm_var": 0.012090047200520834, "learning_rate": 0.0001, "loss": 1.3442, "loss/crossentropy": 2.4986380338668823, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.1567120999097824, "step": 6239 }, { "epoch": 0.09317674463748422, "grad_norm": 0.416015625, "grad_norm_var": 0.012074772516886394, "learning_rate": 0.0001, "loss": 1.723, "loss/crossentropy": 2.409444212913513, "loss/fcd": 1.44921875, "loss/idx": 10.0, "loss/logits": 0.27381160855293274, "step": 6240 }, { "epoch": 0.09319167680809921, "grad_norm": 0.310546875, "grad_norm_var": 0.012322489420572917, "learning_rate": 0.0001, "loss": 1.3487, "loss/crossentropy": 2.660559892654419, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.1611868292093277, "step": 6241 }, { "epoch": 0.09320660897871419, "grad_norm": 0.35546875, "grad_norm_var": 0.012357695897420248, "learning_rate": 0.0001, "loss": 1.5956, "loss/crossentropy": 2.394827961921692, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.2245483323931694, "step": 6242 }, { "epoch": 0.09322154114932918, "grad_norm": 0.373046875, "grad_norm_var": 0.012204345067342122, "learning_rate": 0.0001, "loss": 1.5425, "loss/crossentropy": 2.7200270891189575, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20652847737073898, "step": 6243 }, { "epoch": 0.09323647331994415, "grad_norm": 0.353515625, "grad_norm_var": 0.012173970540364584, "learning_rate": 0.0001, "loss": 1.5403, "loss/crossentropy": 2.6427773237228394, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.20044496655464172, "step": 6244 }, { "epoch": 0.09325140549055913, "grad_norm": 0.408203125, "grad_norm_var": 0.012107451756795248, "learning_rate": 0.0001, "loss": 1.5778, "loss/crossentropy": 2.6289217472076416, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.23010224103927612, "step": 6245 }, { "epoch": 0.09326633766117412, "grad_norm": 0.333984375, "grad_norm_var": 0.0012357076009114583, "learning_rate": 0.0001, "loss": 1.4119, "loss/crossentropy": 2.703512668609619, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.17365802079439163, "step": 6246 }, { "epoch": 0.0932812698317891, "grad_norm": 0.494140625, "grad_norm_var": 0.0022843519846598307, "learning_rate": 0.0001, "loss": 1.8658, "loss/crossentropy": 2.4520113468170166, "loss/fcd": 1.6171875, "loss/idx": 10.0, "loss/logits": 0.24861449003219604, "step": 6247 }, { "epoch": 0.09329620200240409, "grad_norm": 0.34375, "grad_norm_var": 0.0023368676503499348, "learning_rate": 0.0001, "loss": 1.385, "loss/crossentropy": 2.625216841697693, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.18191814422607422, "step": 6248 }, { "epoch": 0.09331113417301906, "grad_norm": 0.412109375, "grad_norm_var": 0.0024113972981770833, "learning_rate": 0.0001, "loss": 1.5889, "loss/crossentropy": 2.618574619293213, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.20609420537948608, "step": 6249 }, { "epoch": 0.09332606634363404, "grad_norm": 0.34375, "grad_norm_var": 0.0022445042928059896, "learning_rate": 0.0001, "loss": 1.4742, "loss/crossentropy": 2.668047070503235, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.1890578418970108, "step": 6250 }, { "epoch": 0.09334099851424903, "grad_norm": 0.369140625, "grad_norm_var": 0.0021761417388916015, "learning_rate": 0.0001, "loss": 1.5934, "loss/crossentropy": 2.5718398094177246, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.23010985553264618, "step": 6251 }, { "epoch": 0.093355930684864, "grad_norm": 0.40234375, "grad_norm_var": 0.002212778727213542, "learning_rate": 0.0001, "loss": 1.4998, "loss/crossentropy": 2.648694634437561, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.1951594278216362, "step": 6252 }, { "epoch": 0.09337086285547899, "grad_norm": 0.322265625, "grad_norm_var": 0.0023157596588134766, "learning_rate": 0.0001, "loss": 1.3908, "loss/crossentropy": 2.4114502668380737, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.16420136392116547, "step": 6253 }, { "epoch": 0.09338579502609397, "grad_norm": 0.36328125, "grad_norm_var": 0.0022612094879150392, "learning_rate": 0.0001, "loss": 1.6249, "loss/crossentropy": 2.5511285066604614, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.2460121065378189, "step": 6254 }, { "epoch": 0.09340072719670894, "grad_norm": 0.421875, "grad_norm_var": 0.002190399169921875, "learning_rate": 0.0001, "loss": 1.5766, "loss/crossentropy": 2.6165863275527954, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.20939771085977554, "step": 6255 }, { "epoch": 0.09341565936732393, "grad_norm": 0.3671875, "grad_norm_var": 0.0020819187164306642, "learning_rate": 0.0001, "loss": 1.4954, "loss/crossentropy": 2.56782329082489, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19852468371391296, "step": 6256 }, { "epoch": 0.09343059153793891, "grad_norm": 0.408203125, "grad_norm_var": 0.0018593947092692058, "learning_rate": 0.0001, "loss": 1.4241, "loss/crossentropy": 2.696123480796814, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.16235092282295227, "step": 6257 }, { "epoch": 0.0934455237085539, "grad_norm": 0.4609375, "grad_norm_var": 0.002216450373331706, "learning_rate": 0.0001, "loss": 1.4721, "loss/crossentropy": 2.3568607568740845, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.186987042427063, "step": 6258 }, { "epoch": 0.09346045587916887, "grad_norm": 0.328125, "grad_norm_var": 0.002420806884765625, "learning_rate": 0.0001, "loss": 1.4144, "loss/crossentropy": 2.622328996658325, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.17998357862234116, "step": 6259 }, { "epoch": 0.09347538804978385, "grad_norm": 0.3828125, "grad_norm_var": 0.002358102798461914, "learning_rate": 0.0001, "loss": 1.52, "loss/crossentropy": 2.630080461502075, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20751222968101501, "step": 6260 }, { "epoch": 0.09349032022039884, "grad_norm": 0.3671875, "grad_norm_var": 0.002337074279785156, "learning_rate": 0.0001, "loss": 1.4452, "loss/crossentropy": 2.41320264339447, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.17958928644657135, "step": 6261 }, { "epoch": 0.09350525239101382, "grad_norm": 0.396484375, "grad_norm_var": 0.002176348368326823, "learning_rate": 0.0001, "loss": 1.4553, "loss/crossentropy": 2.8020814657211304, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.1935502141714096, "step": 6262 }, { "epoch": 0.0935201845616288, "grad_norm": 0.33984375, "grad_norm_var": 0.0014493147532145183, "learning_rate": 0.0001, "loss": 1.5311, "loss/crossentropy": 2.5621238946914673, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.22255174815654755, "step": 6263 }, { "epoch": 0.09353511673224378, "grad_norm": 0.33984375, "grad_norm_var": 0.0014674981435139974, "learning_rate": 0.0001, "loss": 1.5537, "loss/crossentropy": 2.4299721717834473, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.20607322454452515, "step": 6264 }, { "epoch": 0.09355004890285877, "grad_norm": 0.333984375, "grad_norm_var": 0.0014789422353108724, "learning_rate": 0.0001, "loss": 1.4893, "loss/crossentropy": 2.4382903575897217, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19245267659425735, "step": 6265 }, { "epoch": 0.09356498107347375, "grad_norm": 0.384765625, "grad_norm_var": 0.0014312108357747396, "learning_rate": 0.0001, "loss": 1.6014, "loss/crossentropy": 2.551361918449402, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.21082012355327606, "step": 6266 }, { "epoch": 0.09357991324408872, "grad_norm": 0.330078125, "grad_norm_var": 0.0015532811482747396, "learning_rate": 0.0001, "loss": 1.4748, "loss/crossentropy": 2.5526859760284424, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.2013453170657158, "step": 6267 }, { "epoch": 0.09359484541470371, "grad_norm": 0.431640625, "grad_norm_var": 0.0017261346181233724, "learning_rate": 0.0001, "loss": 1.744, "loss/crossentropy": 2.4509412050247192, "loss/fcd": 1.5078125, "loss/idx": 10.0, "loss/logits": 0.2361718937754631, "step": 6268 }, { "epoch": 0.09360977758531869, "grad_norm": 0.361328125, "grad_norm_var": 0.0015538374582926432, "learning_rate": 0.0001, "loss": 1.5068, "loss/crossentropy": 2.845022678375244, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.20606020092964172, "step": 6269 }, { "epoch": 0.09362470975593368, "grad_norm": 0.470703125, "grad_norm_var": 0.0020914713541666666, "learning_rate": 0.0001, "loss": 1.7365, "loss/crossentropy": 2.663253903388977, "loss/fcd": 1.50390625, "loss/idx": 10.0, "loss/logits": 0.23257745802402496, "step": 6270 }, { "epoch": 0.09363964192654865, "grad_norm": 0.443359375, "grad_norm_var": 0.002232217788696289, "learning_rate": 0.0001, "loss": 1.7396, "loss/crossentropy": 2.5925793647766113, "loss/fcd": 1.46875, "loss/idx": 10.0, "loss/logits": 0.27085772156715393, "step": 6271 }, { "epoch": 0.09365457409716363, "grad_norm": 0.326171875, "grad_norm_var": 0.002430152893066406, "learning_rate": 0.0001, "loss": 1.3874, "loss/crossentropy": 2.5264663696289062, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.18427396565675735, "step": 6272 }, { "epoch": 0.09366950626777862, "grad_norm": 0.3359375, "grad_norm_var": 0.002500136693318685, "learning_rate": 0.0001, "loss": 1.3277, "loss/crossentropy": 2.4892512559890747, "loss/fcd": 1.171875, "loss/idx": 10.0, "loss/logits": 0.15586986392736435, "step": 6273 }, { "epoch": 0.0936844384383936, "grad_norm": 0.37890625, "grad_norm_var": 0.00200346310933431, "learning_rate": 0.0001, "loss": 1.4518, "loss/crossentropy": 2.682263493537903, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18618719279766083, "step": 6274 }, { "epoch": 0.09369937060900858, "grad_norm": 0.357421875, "grad_norm_var": 0.0018859227498372396, "learning_rate": 0.0001, "loss": 1.5332, "loss/crossentropy": 2.6470718383789062, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.20504220575094223, "step": 6275 }, { "epoch": 0.09371430277962356, "grad_norm": 0.38671875, "grad_norm_var": 0.0018915812174479167, "learning_rate": 0.0001, "loss": 1.5439, "loss/crossentropy": 2.600034713745117, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.21964890509843826, "step": 6276 }, { "epoch": 0.09372923495023854, "grad_norm": 0.35546875, "grad_norm_var": 0.001910845438639323, "learning_rate": 0.0001, "loss": 1.4458, "loss/crossentropy": 2.7714121341705322, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1996714249253273, "step": 6277 }, { "epoch": 0.09374416712085352, "grad_norm": 0.474609375, "grad_norm_var": 0.0025339126586914062, "learning_rate": 0.0001, "loss": 1.8786, "loss/crossentropy": 2.355145573616028, "loss/fcd": 1.6015625, "loss/idx": 10.0, "loss/logits": 0.2770025506615639, "step": 6278 }, { "epoch": 0.0937590992914685, "grad_norm": 0.34765625, "grad_norm_var": 0.0024978001912434895, "learning_rate": 0.0001, "loss": 1.4221, "loss/crossentropy": 2.5762094259262085, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.17990336567163467, "step": 6279 }, { "epoch": 0.09377403146208349, "grad_norm": 0.4609375, "grad_norm_var": 0.0027875264485677084, "learning_rate": 0.0001, "loss": 1.5897, "loss/crossentropy": 2.6556358337402344, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.22250070422887802, "step": 6280 }, { "epoch": 0.09378896363269847, "grad_norm": 0.421875, "grad_norm_var": 0.00265806516011556, "learning_rate": 0.0001, "loss": 1.73, "loss/crossentropy": 2.7534399032592773, "loss/fcd": 1.44921875, "loss/idx": 10.0, "loss/logits": 0.2807723432779312, "step": 6281 }, { "epoch": 0.09380389580331344, "grad_norm": 0.34765625, "grad_norm_var": 0.0027785619099934896, "learning_rate": 0.0001, "loss": 1.5631, "loss/crossentropy": 2.585623860359192, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.21547582745552063, "step": 6282 }, { "epoch": 0.09381882797392843, "grad_norm": 0.396484375, "grad_norm_var": 0.002528889973958333, "learning_rate": 0.0001, "loss": 1.5001, "loss/crossentropy": 2.8082178831100464, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.20322032272815704, "step": 6283 }, { "epoch": 0.09383376014454341, "grad_norm": 0.37109375, "grad_norm_var": 0.00245054562886556, "learning_rate": 0.0001, "loss": 1.4948, "loss/crossentropy": 2.6914632320404053, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.22134751081466675, "step": 6284 }, { "epoch": 0.0938486923151584, "grad_norm": 0.404296875, "grad_norm_var": 0.0024029890696207683, "learning_rate": 0.0001, "loss": 1.494, "loss/crossentropy": 2.539798140525818, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.1971389353275299, "step": 6285 }, { "epoch": 0.09386362448577337, "grad_norm": 0.42578125, "grad_norm_var": 0.0020604451497395833, "learning_rate": 0.0001, "loss": 1.4075, "loss/crossentropy": 2.8340041637420654, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1770005002617836, "step": 6286 }, { "epoch": 0.09387855665638836, "grad_norm": 0.365234375, "grad_norm_var": 0.0018824259440104166, "learning_rate": 0.0001, "loss": 1.4255, "loss/crossentropy": 2.547627329826355, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.1872011050581932, "step": 6287 }, { "epoch": 0.09389348882700334, "grad_norm": 0.421875, "grad_norm_var": 0.0017071882883707681, "learning_rate": 0.0001, "loss": 1.5974, "loss/crossentropy": 2.416662096977234, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.20681830495595932, "step": 6288 }, { "epoch": 0.09390842099761831, "grad_norm": 0.423828125, "grad_norm_var": 0.0015476862589518228, "learning_rate": 0.0001, "loss": 1.5564, "loss/crossentropy": 2.6728659868240356, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.21659383177757263, "step": 6289 }, { "epoch": 0.0939233531682333, "grad_norm": 0.43359375, "grad_norm_var": 0.0016082127888997395, "learning_rate": 0.0001, "loss": 1.5987, "loss/crossentropy": 2.489604353904724, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.22372471541166306, "step": 6290 }, { "epoch": 0.09393828533884828, "grad_norm": 0.341796875, "grad_norm_var": 0.0017114639282226562, "learning_rate": 0.0001, "loss": 1.384, "loss/crossentropy": 2.5064324140548706, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.16522016376256943, "step": 6291 }, { "epoch": 0.09395321750946327, "grad_norm": 0.53515625, "grad_norm_var": 0.0028518040974934897, "learning_rate": 0.0001, "loss": 1.6518, "loss/crossentropy": 2.677892327308655, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.22602880001068115, "step": 6292 }, { "epoch": 0.09396814968007824, "grad_norm": 0.439453125, "grad_norm_var": 0.0027048587799072266, "learning_rate": 0.0001, "loss": 1.6314, "loss/crossentropy": 2.5195811986923218, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.225193589925766, "step": 6293 }, { "epoch": 0.09398308185069322, "grad_norm": 0.3359375, "grad_norm_var": 0.002771441141764323, "learning_rate": 0.0001, "loss": 1.3966, "loss/crossentropy": 2.607347011566162, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.18564331531524658, "step": 6294 }, { "epoch": 0.09399801402130821, "grad_norm": 0.3671875, "grad_norm_var": 0.002647145589192708, "learning_rate": 0.0001, "loss": 1.5392, "loss/crossentropy": 2.6315999031066895, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.21502207964658737, "step": 6295 }, { "epoch": 0.09401294619192319, "grad_norm": 0.4453125, "grad_norm_var": 0.002547454833984375, "learning_rate": 0.0001, "loss": 1.5832, "loss/crossentropy": 2.4989360570907593, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.21206260472536087, "step": 6296 }, { "epoch": 0.09402787836253818, "grad_norm": 0.3828125, "grad_norm_var": 0.0025538126627604168, "learning_rate": 0.0001, "loss": 1.6054, "loss/crossentropy": 2.4389032125473022, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.2264796644449234, "step": 6297 }, { "epoch": 0.09404281053315315, "grad_norm": 0.328125, "grad_norm_var": 0.002720069885253906, "learning_rate": 0.0001, "loss": 1.3601, "loss/crossentropy": 2.3168113231658936, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.1687261238694191, "step": 6298 }, { "epoch": 0.09405774270376813, "grad_norm": 0.353515625, "grad_norm_var": 0.0028620402018229166, "learning_rate": 0.0001, "loss": 1.5349, "loss/crossentropy": 2.6109853982925415, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.2028978392481804, "step": 6299 }, { "epoch": 0.09407267487438312, "grad_norm": 0.359375, "grad_norm_var": 0.002913347880045573, "learning_rate": 0.0001, "loss": 1.5735, "loss/crossentropy": 2.873142719268799, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.22978483140468597, "step": 6300 }, { "epoch": 0.09408760704499809, "grad_norm": 0.41015625, "grad_norm_var": 0.002920643488566081, "learning_rate": 0.0001, "loss": 1.6333, "loss/crossentropy": 2.4327701330184937, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.23098024725914001, "step": 6301 }, { "epoch": 0.09410253921561308, "grad_norm": 0.345703125, "grad_norm_var": 0.0030255635579427083, "learning_rate": 0.0001, "loss": 1.4888, "loss/crossentropy": 2.638652205467224, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19971778243780136, "step": 6302 }, { "epoch": 0.09411747138622806, "grad_norm": 0.412109375, "grad_norm_var": 0.0029889424641927082, "learning_rate": 0.0001, "loss": 1.4795, "loss/crossentropy": 2.4827029705047607, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.17871956527233124, "step": 6303 }, { "epoch": 0.09413240355684305, "grad_norm": 0.353515625, "grad_norm_var": 0.0030451297760009767, "learning_rate": 0.0001, "loss": 1.4151, "loss/crossentropy": 2.655813455581665, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18076995015144348, "step": 6304 }, { "epoch": 0.09414733572745802, "grad_norm": 0.33984375, "grad_norm_var": 0.0031264623006184897, "learning_rate": 0.0001, "loss": 1.4604, "loss/crossentropy": 2.6943602561950684, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.20647256076335907, "step": 6305 }, { "epoch": 0.094162267898073, "grad_norm": 0.37109375, "grad_norm_var": 0.002977943420410156, "learning_rate": 0.0001, "loss": 1.653, "loss/crossentropy": 2.477417469024658, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.2546059191226959, "step": 6306 }, { "epoch": 0.09417720006868799, "grad_norm": 0.357421875, "grad_norm_var": 0.0029082616170247396, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.7589913606643677, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.19219328463077545, "step": 6307 }, { "epoch": 0.09419213223930296, "grad_norm": 0.31640625, "grad_norm_var": 0.0014769872029622396, "learning_rate": 0.0001, "loss": 1.3552, "loss/crossentropy": 2.590673565864563, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.17164956033229828, "step": 6308 }, { "epoch": 0.09420706440991795, "grad_norm": 0.408203125, "grad_norm_var": 0.0012481053670247396, "learning_rate": 0.0001, "loss": 1.5393, "loss/crossentropy": 2.6792908906936646, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.2111673653125763, "step": 6309 }, { "epoch": 0.09422199658053293, "grad_norm": 0.34765625, "grad_norm_var": 0.0012067159016927083, "learning_rate": 0.0001, "loss": 1.431, "loss/crossentropy": 2.5987547636032104, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.16931137442588806, "step": 6310 }, { "epoch": 0.0942369287511479, "grad_norm": 0.34375, "grad_norm_var": 0.0012456258138020834, "learning_rate": 0.0001, "loss": 1.4226, "loss/crossentropy": 2.445320963859558, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18041079491376877, "step": 6311 }, { "epoch": 0.0942518609217629, "grad_norm": 0.470703125, "grad_norm_var": 0.0015504042307535808, "learning_rate": 0.0001, "loss": 1.6898, "loss/crossentropy": 2.53310489654541, "loss/fcd": 1.453125, "loss/idx": 10.0, "loss/logits": 0.23666983842849731, "step": 6312 }, { "epoch": 0.09426679309237787, "grad_norm": 0.31640625, "grad_norm_var": 0.0017017205556233725, "learning_rate": 0.0001, "loss": 1.4229, "loss/crossentropy": 2.637665033340454, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.19635502994060516, "step": 6313 }, { "epoch": 0.09428172526299286, "grad_norm": 0.34375, "grad_norm_var": 0.0016409397125244141, "learning_rate": 0.0001, "loss": 1.4459, "loss/crossentropy": 2.550228476524353, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18027859181165695, "step": 6314 }, { "epoch": 0.09429665743360784, "grad_norm": 0.349609375, "grad_norm_var": 0.0016481876373291016, "learning_rate": 0.0001, "loss": 1.3811, "loss/crossentropy": 2.597057580947876, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.15848535299301147, "step": 6315 }, { "epoch": 0.09431158960422281, "grad_norm": 0.34765625, "grad_norm_var": 0.001666116714477539, "learning_rate": 0.0001, "loss": 1.4795, "loss/crossentropy": 2.674976944923401, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19039606302976608, "step": 6316 }, { "epoch": 0.0943265217748378, "grad_norm": 0.39453125, "grad_norm_var": 0.0015865166982014973, "learning_rate": 0.0001, "loss": 1.6128, "loss/crossentropy": 2.404433846473694, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.24565020203590393, "step": 6317 }, { "epoch": 0.09434145394545278, "grad_norm": 0.439453125, "grad_norm_var": 0.0019115289052327475, "learning_rate": 0.0001, "loss": 1.7398, "loss/crossentropy": 2.5344194173812866, "loss/fcd": 1.51171875, "loss/idx": 10.0, "loss/logits": 0.22808732837438583, "step": 6318 }, { "epoch": 0.09435638611606777, "grad_norm": 0.330078125, "grad_norm_var": 0.0018661340077718099, "learning_rate": 0.0001, "loss": 1.4528, "loss/crossentropy": 2.617222785949707, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1949390172958374, "step": 6319 }, { "epoch": 0.09437131828668274, "grad_norm": 0.35546875, "grad_norm_var": 0.0018635431925455729, "learning_rate": 0.0001, "loss": 1.3443, "loss/crossentropy": 2.7459139823913574, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.1606845110654831, "step": 6320 }, { "epoch": 0.09438625045729772, "grad_norm": 0.474609375, "grad_norm_var": 0.0025555769602457683, "learning_rate": 0.0001, "loss": 1.6087, "loss/crossentropy": 2.574242115020752, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.24545687437057495, "step": 6321 }, { "epoch": 0.09440118262791271, "grad_norm": 0.365234375, "grad_norm_var": 0.0025591532389322917, "learning_rate": 0.0001, "loss": 1.5205, "loss/crossentropy": 2.326180100440979, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.20410490036010742, "step": 6322 }, { "epoch": 0.09441611479852768, "grad_norm": 0.34765625, "grad_norm_var": 0.0025848229726155598, "learning_rate": 0.0001, "loss": 1.4097, "loss/crossentropy": 2.7929824590682983, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.1714651733636856, "step": 6323 }, { "epoch": 0.09443104696914267, "grad_norm": 0.3671875, "grad_norm_var": 0.0023699283599853517, "learning_rate": 0.0001, "loss": 1.3652, "loss/crossentropy": 2.6799696683883667, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.1699347198009491, "step": 6324 }, { "epoch": 0.09444597913975765, "grad_norm": 0.36328125, "grad_norm_var": 0.0022979100545247396, "learning_rate": 0.0001, "loss": 1.4806, "loss/crossentropy": 2.6864192485809326, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1993648186326027, "step": 6325 }, { "epoch": 0.09446091131037264, "grad_norm": 0.474609375, "grad_norm_var": 0.002887837092081706, "learning_rate": 0.0001, "loss": 1.9104, "loss/crossentropy": 2.5040892362594604, "loss/fcd": 1.6328125, "loss/idx": 10.0, "loss/logits": 0.27757084369659424, "step": 6326 }, { "epoch": 0.09447584348098761, "grad_norm": 0.35546875, "grad_norm_var": 0.002839390436808268, "learning_rate": 0.0001, "loss": 1.4934, "loss/crossentropy": 2.853330612182617, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.20823712646961212, "step": 6327 }, { "epoch": 0.09449077565160259, "grad_norm": 0.41015625, "grad_norm_var": 0.002344195048014323, "learning_rate": 0.0001, "loss": 1.9291, "loss/crossentropy": 2.5312403440475464, "loss/fcd": 1.5703125, "loss/idx": 10.0, "loss/logits": 0.3587787598371506, "step": 6328 }, { "epoch": 0.09450570782221758, "grad_norm": 0.361328125, "grad_norm_var": 0.0021062056223551434, "learning_rate": 0.0001, "loss": 1.4244, "loss/crossentropy": 2.672628402709961, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.17828767001628876, "step": 6329 }, { "epoch": 0.09452063999283256, "grad_norm": 0.359375, "grad_norm_var": 0.002045933405558268, "learning_rate": 0.0001, "loss": 1.587, "loss/crossentropy": 2.503372311592102, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.23155054450035095, "step": 6330 }, { "epoch": 0.09453557216344755, "grad_norm": 0.384765625, "grad_norm_var": 0.001976124445597331, "learning_rate": 0.0001, "loss": 1.4582, "loss/crossentropy": 2.766295313835144, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19260015338659286, "step": 6331 }, { "epoch": 0.09455050433406252, "grad_norm": 0.416015625, "grad_norm_var": 0.0019444147745768228, "learning_rate": 0.0001, "loss": 1.561, "loss/crossentropy": 2.8435717821121216, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.22506800293922424, "step": 6332 }, { "epoch": 0.0945654365046775, "grad_norm": 0.408203125, "grad_norm_var": 0.001969003677368164, "learning_rate": 0.0001, "loss": 1.6498, "loss/crossentropy": 2.580525517463684, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.24351955205202103, "step": 6333 }, { "epoch": 0.09458036867529249, "grad_norm": 0.396484375, "grad_norm_var": 0.00179136594136556, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.320794105529785, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.20081020891666412, "step": 6334 }, { "epoch": 0.09459530084590746, "grad_norm": 0.333984375, "grad_norm_var": 0.0017633914947509765, "learning_rate": 0.0001, "loss": 1.3605, "loss/crossentropy": 2.6582834720611572, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.1690548062324524, "step": 6335 }, { "epoch": 0.09461023301652245, "grad_norm": 0.384765625, "grad_norm_var": 0.00169830322265625, "learning_rate": 0.0001, "loss": 1.532, "loss/crossentropy": 2.6166820526123047, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.21169540286064148, "step": 6336 }, { "epoch": 0.09462516518713743, "grad_norm": 0.38671875, "grad_norm_var": 0.001162576675415039, "learning_rate": 0.0001, "loss": 1.5612, "loss/crossentropy": 2.5051249265670776, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.20966903120279312, "step": 6337 }, { "epoch": 0.0946400973577524, "grad_norm": 0.341796875, "grad_norm_var": 0.0012499332427978516, "learning_rate": 0.0001, "loss": 1.4103, "loss/crossentropy": 2.563847541809082, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.17594169080257416, "step": 6338 }, { "epoch": 0.0946550295283674, "grad_norm": 0.3359375, "grad_norm_var": 0.0013102054595947265, "learning_rate": 0.0001, "loss": 1.348, "loss/crossentropy": 2.6795825958251953, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.16826383024454117, "step": 6339 }, { "epoch": 0.09466996169898237, "grad_norm": 0.34765625, "grad_norm_var": 0.0013674259185791015, "learning_rate": 0.0001, "loss": 1.4963, "loss/crossentropy": 2.6515793800354004, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.1994618922472, "step": 6340 }, { "epoch": 0.09468489386959736, "grad_norm": 0.306640625, "grad_norm_var": 0.0016850153605143228, "learning_rate": 0.0001, "loss": 1.3541, "loss/crossentropy": 2.678080201148987, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.17442107200622559, "step": 6341 }, { "epoch": 0.09469982604021233, "grad_norm": 0.3671875, "grad_norm_var": 0.0009830315907796224, "learning_rate": 0.0001, "loss": 1.4008, "loss/crossentropy": 2.6785510778427124, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17810840904712677, "step": 6342 }, { "epoch": 0.09471475821082731, "grad_norm": 0.392578125, "grad_norm_var": 0.001004473368326823, "learning_rate": 0.0001, "loss": 1.5507, "loss/crossentropy": 2.749709367752075, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.2108929380774498, "step": 6343 }, { "epoch": 0.0947296903814423, "grad_norm": 0.48828125, "grad_norm_var": 0.0017953872680664062, "learning_rate": 0.0001, "loss": 1.6357, "loss/crossentropy": 2.578016996383667, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.23730586469173431, "step": 6344 }, { "epoch": 0.09474462255205728, "grad_norm": 0.3515625, "grad_norm_var": 0.0018201033274332682, "learning_rate": 0.0001, "loss": 1.5188, "loss/crossentropy": 2.546024799346924, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.1985112726688385, "step": 6345 }, { "epoch": 0.09475955472267227, "grad_norm": 0.310546875, "grad_norm_var": 0.002071634928385417, "learning_rate": 0.0001, "loss": 1.3765, "loss/crossentropy": 2.574641704559326, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.16942735761404037, "step": 6346 }, { "epoch": 0.09477448689328724, "grad_norm": 0.34375, "grad_norm_var": 0.0021073500315348308, "learning_rate": 0.0001, "loss": 1.4402, "loss/crossentropy": 2.2997416257858276, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18628284335136414, "step": 6347 }, { "epoch": 0.09478941906390223, "grad_norm": 0.369140625, "grad_norm_var": 0.0019539992014567056, "learning_rate": 0.0001, "loss": 1.4439, "loss/crossentropy": 2.5651360750198364, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.1899954304099083, "step": 6348 }, { "epoch": 0.0948043512345172, "grad_norm": 0.365234375, "grad_norm_var": 0.001830911636352539, "learning_rate": 0.0001, "loss": 1.404, "loss/crossentropy": 2.712257742881775, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.1813601851463318, "step": 6349 }, { "epoch": 0.09481928340513218, "grad_norm": 0.322265625, "grad_norm_var": 0.0018526554107666016, "learning_rate": 0.0001, "loss": 1.3341, "loss/crossentropy": 2.775466203689575, "loss/fcd": 1.16796875, "loss/idx": 10.0, "loss/logits": 0.16608544439077377, "step": 6350 }, { "epoch": 0.09483421557574717, "grad_norm": 0.326171875, "grad_norm_var": 0.001882791519165039, "learning_rate": 0.0001, "loss": 1.3766, "loss/crossentropy": 2.651970863342285, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.17736178636550903, "step": 6351 }, { "epoch": 0.09484914774636215, "grad_norm": 0.388671875, "grad_norm_var": 0.001897287368774414, "learning_rate": 0.0001, "loss": 1.6382, "loss/crossentropy": 2.663511633872986, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.22019247710704803, "step": 6352 }, { "epoch": 0.09486407991697714, "grad_norm": 0.369140625, "grad_norm_var": 0.001851654052734375, "learning_rate": 0.0001, "loss": 1.5445, "loss/crossentropy": 2.7158206701278687, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20855619758367538, "step": 6353 }, { "epoch": 0.09487901208759211, "grad_norm": 0.330078125, "grad_norm_var": 0.0018854141235351562, "learning_rate": 0.0001, "loss": 1.4249, "loss/crossentropy": 2.6210190057754517, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18661265075206757, "step": 6354 }, { "epoch": 0.09489394425820709, "grad_norm": 0.390625, "grad_norm_var": 0.0019174575805664062, "learning_rate": 0.0001, "loss": 1.5815, "loss/crossentropy": 2.8913503885269165, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.22598177194595337, "step": 6355 }, { "epoch": 0.09490887642882208, "grad_norm": 0.359375, "grad_norm_var": 0.00190582275390625, "learning_rate": 0.0001, "loss": 1.48, "loss/crossentropy": 2.6943410634994507, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.2026655450463295, "step": 6356 }, { "epoch": 0.09492380859943705, "grad_norm": 0.431640625, "grad_norm_var": 0.0019709269205729165, "learning_rate": 0.0001, "loss": 1.6258, "loss/crossentropy": 2.71211040019989, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.22734922170639038, "step": 6357 }, { "epoch": 0.09493874077005204, "grad_norm": 0.330078125, "grad_norm_var": 0.0020666599273681642, "learning_rate": 0.0001, "loss": 1.5145, "loss/crossentropy": 2.6311997175216675, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20203042030334473, "step": 6358 }, { "epoch": 0.09495367294066702, "grad_norm": 0.40234375, "grad_norm_var": 0.002106157938639323, "learning_rate": 0.0001, "loss": 1.6591, "loss/crossentropy": 2.475130558013916, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.2332841008901596, "step": 6359 }, { "epoch": 0.094968605111282, "grad_norm": 0.314453125, "grad_norm_var": 0.0011937300364176433, "learning_rate": 0.0001, "loss": 1.4878, "loss/crossentropy": 2.6745002269744873, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.20262730866670609, "step": 6360 }, { "epoch": 0.09498353728189698, "grad_norm": 0.388671875, "grad_norm_var": 0.001255035400390625, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.4354140758514404, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.19269870966672897, "step": 6361 }, { "epoch": 0.09499846945251196, "grad_norm": 0.369140625, "grad_norm_var": 0.0010919570922851562, "learning_rate": 0.0001, "loss": 1.6665, "loss/crossentropy": 2.532801628112793, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.25242486596107483, "step": 6362 }, { "epoch": 0.09501340162312695, "grad_norm": 0.349609375, "grad_norm_var": 0.001079416275024414, "learning_rate": 0.0001, "loss": 1.3706, "loss/crossentropy": 2.5822103023529053, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.16352476924657822, "step": 6363 }, { "epoch": 0.09502833379374193, "grad_norm": 0.3984375, "grad_norm_var": 0.001157379150390625, "learning_rate": 0.0001, "loss": 1.5461, "loss/crossentropy": 2.6306647062301636, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.20628470182418823, "step": 6364 }, { "epoch": 0.09504326596435692, "grad_norm": 0.400390625, "grad_norm_var": 0.0012369155883789062, "learning_rate": 0.0001, "loss": 1.6799, "loss/crossentropy": 2.737868309020996, "loss/fcd": 1.4453125, "loss/idx": 10.0, "loss/logits": 0.23462244123220444, "step": 6365 }, { "epoch": 0.09505819813497189, "grad_norm": 0.419921875, "grad_norm_var": 0.001251220703125, "learning_rate": 0.0001, "loss": 1.5062, "loss/crossentropy": 2.695321202278137, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.19764024019241333, "step": 6366 }, { "epoch": 0.09507313030558687, "grad_norm": 0.37109375, "grad_norm_var": 0.0010965824127197265, "learning_rate": 0.0001, "loss": 1.534, "loss/crossentropy": 2.6471340656280518, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.20976266264915466, "step": 6367 }, { "epoch": 0.09508806247620186, "grad_norm": 0.34375, "grad_norm_var": 0.00114593505859375, "learning_rate": 0.0001, "loss": 1.4332, "loss/crossentropy": 2.553065061569214, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18321334570646286, "step": 6368 }, { "epoch": 0.09510299464681683, "grad_norm": 0.57421875, "grad_norm_var": 0.0036676883697509765, "learning_rate": 0.0001, "loss": 1.8846, "loss/crossentropy": 2.6773409843444824, "loss/fcd": 1.55078125, "loss/idx": 10.0, "loss/logits": 0.3338579088449478, "step": 6369 }, { "epoch": 0.09511792681743182, "grad_norm": 0.326171875, "grad_norm_var": 0.003697697321573893, "learning_rate": 0.0001, "loss": 1.5196, "loss/crossentropy": 2.408528447151184, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.2070959210395813, "step": 6370 }, { "epoch": 0.0951328589880468, "grad_norm": 0.330078125, "grad_norm_var": 0.00388641357421875, "learning_rate": 0.0001, "loss": 1.3298, "loss/crossentropy": 2.6686776876449585, "loss/fcd": 1.16015625, "loss/idx": 10.0, "loss/logits": 0.1695995330810547, "step": 6371 }, { "epoch": 0.09514779115866177, "grad_norm": 0.53125, "grad_norm_var": 0.005217997233072916, "learning_rate": 0.0001, "loss": 1.7171, "loss/crossentropy": 2.6512328386306763, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.2835410609841347, "step": 6372 }, { "epoch": 0.09516272332927676, "grad_norm": 0.333984375, "grad_norm_var": 0.0053054173787434895, "learning_rate": 0.0001, "loss": 1.3363, "loss/crossentropy": 2.6025805473327637, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.1565856635570526, "step": 6373 }, { "epoch": 0.09517765549989174, "grad_norm": 0.38671875, "grad_norm_var": 0.005080016454060873, "learning_rate": 0.0001, "loss": 1.31, "loss/crossentropy": 2.702047109603882, "loss/fcd": 1.14453125, "loss/idx": 10.0, "loss/logits": 0.1654503494501114, "step": 6374 }, { "epoch": 0.09519258767050673, "grad_norm": 0.345703125, "grad_norm_var": 0.005187416076660156, "learning_rate": 0.0001, "loss": 1.3705, "loss/crossentropy": 2.5849944353103638, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.1752263456583023, "step": 6375 }, { "epoch": 0.0952075198411217, "grad_norm": 0.384765625, "grad_norm_var": 0.004821205139160156, "learning_rate": 0.0001, "loss": 1.5389, "loss/crossentropy": 2.4146928787231445, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20297424495220184, "step": 6376 }, { "epoch": 0.09522245201173668, "grad_norm": 0.37890625, "grad_norm_var": 0.004830026626586914, "learning_rate": 0.0001, "loss": 1.7007, "loss/crossentropy": 2.28320050239563, "loss/fcd": 1.453125, "loss/idx": 10.0, "loss/logits": 0.24756087362766266, "step": 6377 }, { "epoch": 0.09523738418235167, "grad_norm": 0.3984375, "grad_norm_var": 0.004801177978515625, "learning_rate": 0.0001, "loss": 1.5428, "loss/crossentropy": 2.695011019706726, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20690211653709412, "step": 6378 }, { "epoch": 0.09525231635296665, "grad_norm": 0.431640625, "grad_norm_var": 0.004757118225097656, "learning_rate": 0.0001, "loss": 1.5737, "loss/crossentropy": 2.5416144132614136, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.20649129897356033, "step": 6379 }, { "epoch": 0.09526724852358164, "grad_norm": 0.416015625, "grad_norm_var": 0.004779291152954101, "learning_rate": 0.0001, "loss": 1.4312, "loss/crossentropy": 2.4282549619674683, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.16945239901542664, "step": 6380 }, { "epoch": 0.09528218069419661, "grad_norm": 0.54296875, "grad_norm_var": 0.006089274088541667, "learning_rate": 0.0001, "loss": 1.7336, "loss/crossentropy": 2.429960012435913, "loss/fcd": 1.47265625, "loss/idx": 10.0, "loss/logits": 0.26098646223545074, "step": 6381 }, { "epoch": 0.09529711286481159, "grad_norm": 0.82421875, "grad_norm_var": 0.016989628473917644, "learning_rate": 0.0001, "loss": 2.1402, "loss/crossentropy": 2.353924036026001, "loss/fcd": 1.8359375, "loss/idx": 10.0, "loss/logits": 0.3042386993765831, "step": 6382 }, { "epoch": 0.09531204503542658, "grad_norm": 0.376953125, "grad_norm_var": 0.01694380442301432, "learning_rate": 0.0001, "loss": 1.4013, "loss/crossentropy": 2.5381596088409424, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.1864159256219864, "step": 6383 }, { "epoch": 0.09532697720604155, "grad_norm": 0.369140625, "grad_norm_var": 0.01668241818745931, "learning_rate": 0.0001, "loss": 1.4723, "loss/crossentropy": 2.7894911766052246, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19498362392187119, "step": 6384 }, { "epoch": 0.09534190937665654, "grad_norm": 0.48046875, "grad_norm_var": 0.01548460324605306, "learning_rate": 0.0001, "loss": 1.5321, "loss/crossentropy": 2.5854309797286987, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.21176480501890182, "step": 6385 }, { "epoch": 0.09535684154727152, "grad_norm": 0.435546875, "grad_norm_var": 0.014738702774047851, "learning_rate": 0.0001, "loss": 1.5231, "loss/crossentropy": 2.395493745803833, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.19105592370033264, "step": 6386 }, { "epoch": 0.09537177371788651, "grad_norm": 0.380859375, "grad_norm_var": 0.014186588923136394, "learning_rate": 0.0001, "loss": 1.4658, "loss/crossentropy": 2.66443133354187, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.1806832253932953, "step": 6387 }, { "epoch": 0.09538670588850148, "grad_norm": 0.357421875, "grad_norm_var": 0.013927714029947916, "learning_rate": 0.0001, "loss": 1.354, "loss/crossentropy": 2.491447925567627, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.15863795578479767, "step": 6388 }, { "epoch": 0.09540163805911646, "grad_norm": 0.34765625, "grad_norm_var": 0.013768498102823894, "learning_rate": 0.0001, "loss": 1.367, "loss/crossentropy": 2.5512884855270386, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.17558763176202774, "step": 6389 }, { "epoch": 0.09541657022973145, "grad_norm": 0.326171875, "grad_norm_var": 0.01433563232421875, "learning_rate": 0.0001, "loss": 1.4283, "loss/crossentropy": 2.7246347665786743, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18999885767698288, "step": 6390 }, { "epoch": 0.09543150240034642, "grad_norm": 0.390625, "grad_norm_var": 0.013987970352172852, "learning_rate": 0.0001, "loss": 1.3568, "loss/crossentropy": 2.5821146965026855, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.1653827279806137, "step": 6391 }, { "epoch": 0.09544643457096141, "grad_norm": 0.353515625, "grad_norm_var": 0.014227533340454101, "learning_rate": 0.0001, "loss": 1.3434, "loss/crossentropy": 2.644243359565735, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.16758184880018234, "step": 6392 }, { "epoch": 0.09546136674157639, "grad_norm": 0.73828125, "grad_norm_var": 0.02005918820699056, "learning_rate": 0.0001, "loss": 1.6841, "loss/crossentropy": 2.5813411474227905, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.262179896235466, "step": 6393 }, { "epoch": 0.09547629891219137, "grad_norm": 0.3515625, "grad_norm_var": 0.020507033665974936, "learning_rate": 0.0001, "loss": 1.453, "loss/crossentropy": 2.531742572784424, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19520027935504913, "step": 6394 }, { "epoch": 0.09549123108280635, "grad_norm": 0.3828125, "grad_norm_var": 0.02074426015218099, "learning_rate": 0.0001, "loss": 1.3665, "loss/crossentropy": 2.7853610515594482, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.1595171093940735, "step": 6395 }, { "epoch": 0.09550616325342133, "grad_norm": 0.337890625, "grad_norm_var": 0.021397844950358073, "learning_rate": 0.0001, "loss": 1.3425, "loss/crossentropy": 2.559072494506836, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.16277434676885605, "step": 6396 }, { "epoch": 0.09552109542403632, "grad_norm": 0.380859375, "grad_norm_var": 0.02075537045796712, "learning_rate": 0.0001, "loss": 1.424, "loss/crossentropy": 2.9054031372070312, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.17793869972229004, "step": 6397 }, { "epoch": 0.0955360275946513, "grad_norm": 0.330078125, "grad_norm_var": 0.009853553771972657, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.5766490697860718, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.19212424755096436, "step": 6398 }, { "epoch": 0.09555095976526627, "grad_norm": 0.39453125, "grad_norm_var": 0.009827661514282226, "learning_rate": 0.0001, "loss": 1.6863, "loss/crossentropy": 2.3815237283706665, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.22924189269542694, "step": 6399 }, { "epoch": 0.09556589193588126, "grad_norm": 0.35546875, "grad_norm_var": 0.0098907470703125, "learning_rate": 0.0001, "loss": 1.4553, "loss/crossentropy": 2.3763684034347534, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.1818627193570137, "step": 6400 }, { "epoch": 0.09558082410649624, "grad_norm": 0.4296875, "grad_norm_var": 0.009483273824055989, "learning_rate": 0.0001, "loss": 1.5488, "loss/crossentropy": 2.5306193828582764, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.20500430464744568, "step": 6401 }, { "epoch": 0.09559575627711123, "grad_norm": 0.34765625, "grad_norm_var": 0.009471114476521809, "learning_rate": 0.0001, "loss": 1.5409, "loss/crossentropy": 2.6569992303848267, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.21277859807014465, "step": 6402 }, { "epoch": 0.0956106884477262, "grad_norm": 0.357421875, "grad_norm_var": 0.009527190526326498, "learning_rate": 0.0001, "loss": 1.4557, "loss/crossentropy": 2.540727734565735, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.1939421370625496, "step": 6403 }, { "epoch": 0.09562562061834118, "grad_norm": 0.306640625, "grad_norm_var": 0.009884246190388997, "learning_rate": 0.0001, "loss": 1.3698, "loss/crossentropy": 2.6988028287887573, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.17446671426296234, "step": 6404 }, { "epoch": 0.09564055278895617, "grad_norm": 0.3359375, "grad_norm_var": 0.009948333104451498, "learning_rate": 0.0001, "loss": 1.4638, "loss/crossentropy": 2.608405828475952, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.1942366287112236, "step": 6405 }, { "epoch": 0.09565548495957114, "grad_norm": 0.380859375, "grad_norm_var": 0.009724919001261394, "learning_rate": 0.0001, "loss": 1.4809, "loss/crossentropy": 2.5598710775375366, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.19960789382457733, "step": 6406 }, { "epoch": 0.09567041713018613, "grad_norm": 0.314453125, "grad_norm_var": 0.010039202372233073, "learning_rate": 0.0001, "loss": 1.3542, "loss/crossentropy": 2.7360512018203735, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.17062294483184814, "step": 6407 }, { "epoch": 0.09568534930080111, "grad_norm": 0.34765625, "grad_norm_var": 0.010062901178995769, "learning_rate": 0.0001, "loss": 1.5186, "loss/crossentropy": 2.7761224508285522, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.21780119836330414, "step": 6408 }, { "epoch": 0.0957002814714161, "grad_norm": 0.365234375, "grad_norm_var": 0.0009765625, "learning_rate": 0.0001, "loss": 1.5344, "loss/crossentropy": 2.674625277519226, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.21018408238887787, "step": 6409 }, { "epoch": 0.09571521364203107, "grad_norm": 0.32421875, "grad_norm_var": 0.0010446548461914063, "learning_rate": 0.0001, "loss": 1.3851, "loss/crossentropy": 2.6194413900375366, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17413869500160217, "step": 6410 }, { "epoch": 0.09573014581264605, "grad_norm": 0.373046875, "grad_norm_var": 0.0010153293609619141, "learning_rate": 0.0001, "loss": 1.6821, "loss/crossentropy": 2.510921835899353, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.24071750044822693, "step": 6411 }, { "epoch": 0.09574507798326104, "grad_norm": 0.318359375, "grad_norm_var": 0.0010839939117431641, "learning_rate": 0.0001, "loss": 1.3713, "loss/crossentropy": 2.6102691888809204, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.1720796376466751, "step": 6412 }, { "epoch": 0.09576001015387602, "grad_norm": 0.3828125, "grad_norm_var": 0.0010912577311197917, "learning_rate": 0.0001, "loss": 1.5822, "loss/crossentropy": 2.6382464170455933, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.21115229278802872, "step": 6413 }, { "epoch": 0.095774942324491, "grad_norm": 0.34765625, "grad_norm_var": 0.0010544935862223308, "learning_rate": 0.0001, "loss": 1.4497, "loss/crossentropy": 2.681511402130127, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19188669323921204, "step": 6414 }, { "epoch": 0.09578987449510598, "grad_norm": 0.41796875, "grad_norm_var": 0.0012120405832926433, "learning_rate": 0.0001, "loss": 1.6067, "loss/crossentropy": 2.577709913253784, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.22778702527284622, "step": 6415 }, { "epoch": 0.09580480666572096, "grad_norm": 0.3359375, "grad_norm_var": 0.0012387434641520183, "learning_rate": 0.0001, "loss": 1.403, "loss/crossentropy": 2.4131991863250732, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.18427760899066925, "step": 6416 }, { "epoch": 0.09581973883633595, "grad_norm": 0.353515625, "grad_norm_var": 0.0008463541666666667, "learning_rate": 0.0001, "loss": 1.491, "loss/crossentropy": 2.564847230911255, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.2019009292125702, "step": 6417 }, { "epoch": 0.09583467100695092, "grad_norm": 0.45703125, "grad_norm_var": 0.0015513102213541667, "learning_rate": 0.0001, "loss": 1.6076, "loss/crossentropy": 2.5188854932785034, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.22481989860534668, "step": 6418 }, { "epoch": 0.09584960317756591, "grad_norm": 0.361328125, "grad_norm_var": 0.001552263895670573, "learning_rate": 0.0001, "loss": 1.4709, "loss/crossentropy": 2.477502465248108, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.18968656659126282, "step": 6419 }, { "epoch": 0.09586453534818089, "grad_norm": 0.3828125, "grad_norm_var": 0.0013966719309488932, "learning_rate": 0.0001, "loss": 1.449, "loss/crossentropy": 2.7417008876800537, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18334931135177612, "step": 6420 }, { "epoch": 0.09587946751879586, "grad_norm": 0.345703125, "grad_norm_var": 0.0013681411743164062, "learning_rate": 0.0001, "loss": 1.4379, "loss/crossentropy": 2.6624860763549805, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.1995868980884552, "step": 6421 }, { "epoch": 0.09589439968941085, "grad_norm": 0.380859375, "grad_norm_var": 0.0013681411743164062, "learning_rate": 0.0001, "loss": 1.4917, "loss/crossentropy": 2.701035737991333, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.20658308267593384, "step": 6422 }, { "epoch": 0.09590933186002583, "grad_norm": 0.439453125, "grad_norm_var": 0.0015349706013997396, "learning_rate": 0.0001, "loss": 1.6374, "loss/crossentropy": 2.637007236480713, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.2232896164059639, "step": 6423 }, { "epoch": 0.09592426403064082, "grad_norm": 0.453125, "grad_norm_var": 0.0019040425618489583, "learning_rate": 0.0001, "loss": 1.7143, "loss/crossentropy": 2.4824419021606445, "loss/fcd": 1.484375, "loss/idx": 10.0, "loss/logits": 0.22995206713676453, "step": 6424 }, { "epoch": 0.0959391962012558, "grad_norm": 0.3828125, "grad_norm_var": 0.0018947442372639974, "learning_rate": 0.0001, "loss": 1.508, "loss/crossentropy": 2.705451488494873, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.19937970489263535, "step": 6425 }, { "epoch": 0.09595412837187078, "grad_norm": 0.4375, "grad_norm_var": 0.0018763065338134766, "learning_rate": 0.0001, "loss": 1.6162, "loss/crossentropy": 2.5261337757110596, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.21779412031173706, "step": 6426 }, { "epoch": 0.09596906054248576, "grad_norm": 0.33984375, "grad_norm_var": 0.002000872294108073, "learning_rate": 0.0001, "loss": 1.4354, "loss/crossentropy": 2.5983405113220215, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.18929073214530945, "step": 6427 }, { "epoch": 0.09598399271310074, "grad_norm": 0.419921875, "grad_norm_var": 0.001762835184733073, "learning_rate": 0.0001, "loss": 1.5503, "loss/crossentropy": 2.462694764137268, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.2222118303179741, "step": 6428 }, { "epoch": 0.09599892488371572, "grad_norm": 0.392578125, "grad_norm_var": 0.0017595767974853515, "learning_rate": 0.0001, "loss": 1.4153, "loss/crossentropy": 2.669346809387207, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.1809145137667656, "step": 6429 }, { "epoch": 0.0960138570543307, "grad_norm": 0.5078125, "grad_norm_var": 0.0024477481842041016, "learning_rate": 0.0001, "loss": 1.4926, "loss/crossentropy": 2.8871508836746216, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.1800559163093567, "step": 6430 }, { "epoch": 0.09602878922494569, "grad_norm": 0.4921875, "grad_norm_var": 0.00296476682027181, "learning_rate": 0.0001, "loss": 1.6122, "loss/crossentropy": 2.3671613931655884, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.1981288269162178, "step": 6431 }, { "epoch": 0.09604372139556067, "grad_norm": 0.435546875, "grad_norm_var": 0.0026656468709309895, "learning_rate": 0.0001, "loss": 1.5284, "loss/crossentropy": 2.4235517978668213, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.19249820709228516, "step": 6432 }, { "epoch": 0.09605865356617564, "grad_norm": 0.380859375, "grad_norm_var": 0.0025014241536458334, "learning_rate": 0.0001, "loss": 1.3592, "loss/crossentropy": 2.777376413345337, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.16777987778186798, "step": 6433 }, { "epoch": 0.09607358573679063, "grad_norm": 0.50390625, "grad_norm_var": 0.0029134114583333334, "learning_rate": 0.0001, "loss": 1.5282, "loss/crossentropy": 2.4005472660064697, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20791320502758026, "step": 6434 }, { "epoch": 0.09608851790740561, "grad_norm": 0.39453125, "grad_norm_var": 0.002740208307902018, "learning_rate": 0.0001, "loss": 1.6412, "loss/crossentropy": 2.358132839202881, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.23883618414402008, "step": 6435 }, { "epoch": 0.0961034500780206, "grad_norm": 0.349609375, "grad_norm_var": 0.0029652913411458335, "learning_rate": 0.0001, "loss": 1.4656, "loss/crossentropy": 2.6770511865615845, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19216109067201614, "step": 6436 }, { "epoch": 0.09611838224863557, "grad_norm": 0.6875, "grad_norm_var": 0.007062514623006185, "learning_rate": 0.0001, "loss": 1.7604, "loss/crossentropy": 2.632333278656006, "loss/fcd": 1.515625, "loss/idx": 10.0, "loss/logits": 0.244820736348629, "step": 6437 }, { "epoch": 0.09613331441925055, "grad_norm": 0.416015625, "grad_norm_var": 0.006874831517537435, "learning_rate": 0.0001, "loss": 1.5482, "loss/crossentropy": 2.5820945501327515, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.22004956752061844, "step": 6438 }, { "epoch": 0.09614824658986554, "grad_norm": 0.361328125, "grad_norm_var": 0.007257572809855143, "learning_rate": 0.0001, "loss": 1.4817, "loss/crossentropy": 2.492702007293701, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19268235564231873, "step": 6439 }, { "epoch": 0.09616317876048051, "grad_norm": 0.365234375, "grad_norm_var": 0.007524363199869792, "learning_rate": 0.0001, "loss": 1.4932, "loss/crossentropy": 2.7356384992599487, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.2002328261733055, "step": 6440 }, { "epoch": 0.0961781109310955, "grad_norm": 0.349609375, "grad_norm_var": 0.007798624038696289, "learning_rate": 0.0001, "loss": 1.58, "loss/crossentropy": 2.8432204723358154, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.23231326043605804, "step": 6441 }, { "epoch": 0.09619304310171048, "grad_norm": 0.369140625, "grad_norm_var": 0.007996114095052083, "learning_rate": 0.0001, "loss": 1.3903, "loss/crossentropy": 2.8409736156463623, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17936547100543976, "step": 6442 }, { "epoch": 0.09620797527232545, "grad_norm": 0.34375, "grad_norm_var": 0.007953834533691407, "learning_rate": 0.0001, "loss": 1.4513, "loss/crossentropy": 2.695967197418213, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.18962115049362183, "step": 6443 }, { "epoch": 0.09622290744294044, "grad_norm": 0.38671875, "grad_norm_var": 0.008036788304646809, "learning_rate": 0.0001, "loss": 1.6133, "loss/crossentropy": 2.345520496368408, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.21482716500759125, "step": 6444 }, { "epoch": 0.09623783961355542, "grad_norm": 0.5, "grad_norm_var": 0.008350626627604166, "learning_rate": 0.0001, "loss": 1.8108, "loss/crossentropy": 2.6000667810440063, "loss/fcd": 1.5390625, "loss/idx": 10.0, "loss/logits": 0.27168799936771393, "step": 6445 }, { "epoch": 0.09625277178417041, "grad_norm": 0.369140625, "grad_norm_var": 0.00807188351949056, "learning_rate": 0.0001, "loss": 1.494, "loss/crossentropy": 2.5887285470962524, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.2049407809972763, "step": 6446 }, { "epoch": 0.09626770395478539, "grad_norm": 0.33203125, "grad_norm_var": 0.008113590876261394, "learning_rate": 0.0001, "loss": 1.4868, "loss/crossentropy": 2.542932868003845, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.21333838999271393, "step": 6447 }, { "epoch": 0.09628263612540038, "grad_norm": 0.51171875, "grad_norm_var": 0.008745257059733074, "learning_rate": 0.0001, "loss": 1.6476, "loss/crossentropy": 2.955501914024353, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.2335861697793007, "step": 6448 }, { "epoch": 0.09629756829601535, "grad_norm": 0.39453125, "grad_norm_var": 0.008696858088175457, "learning_rate": 0.0001, "loss": 1.5974, "loss/crossentropy": 2.3426766395568848, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.21845273673534393, "step": 6449 }, { "epoch": 0.09631250046663033, "grad_norm": 0.373046875, "grad_norm_var": 0.008210182189941406, "learning_rate": 0.0001, "loss": 1.4015, "loss/crossentropy": 2.61396586894989, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17490693926811218, "step": 6450 }, { "epoch": 0.09632743263724532, "grad_norm": 0.419921875, "grad_norm_var": 0.008209975560506184, "learning_rate": 0.0001, "loss": 1.6586, "loss/crossentropy": 2.8347946405410767, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.2562553733587265, "step": 6451 }, { "epoch": 0.09634236480786029, "grad_norm": 0.357421875, "grad_norm_var": 0.00815288225809733, "learning_rate": 0.0001, "loss": 1.4705, "loss/crossentropy": 2.625366687774658, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19317501038312912, "step": 6452 }, { "epoch": 0.09635729697847528, "grad_norm": 0.3203125, "grad_norm_var": 0.00292356808980306, "learning_rate": 0.0001, "loss": 1.321, "loss/crossentropy": 2.574752688407898, "loss/fcd": 1.1640625, "loss/idx": 10.0, "loss/logits": 0.15691353380680084, "step": 6453 }, { "epoch": 0.09637222914909026, "grad_norm": 0.373046875, "grad_norm_var": 0.0028648217519124347, "learning_rate": 0.0001, "loss": 1.5607, "loss/crossentropy": 2.7596991062164307, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.22863751649856567, "step": 6454 }, { "epoch": 0.09638716131970523, "grad_norm": 0.41015625, "grad_norm_var": 0.0028731664021809894, "learning_rate": 0.0001, "loss": 1.5782, "loss/crossentropy": 2.702526092529297, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.21495915949344635, "step": 6455 }, { "epoch": 0.09640209349032022, "grad_norm": 0.4140625, "grad_norm_var": 0.0028870741526285807, "learning_rate": 0.0001, "loss": 1.724, "loss/crossentropy": 2.488826274871826, "loss/fcd": 1.484375, "loss/idx": 10.0, "loss/logits": 0.23963160812854767, "step": 6456 }, { "epoch": 0.0964170256609352, "grad_norm": 2.09375, "grad_norm_var": 0.18384450276692707, "learning_rate": 0.0001, "loss": 2.1169, "loss/crossentropy": 2.8208110332489014, "loss/fcd": 1.78125, "loss/idx": 10.0, "loss/logits": 0.33563902974128723, "step": 6457 }, { "epoch": 0.09643195783155019, "grad_norm": 0.388671875, "grad_norm_var": 0.18353265126546223, "learning_rate": 0.0001, "loss": 1.8026, "loss/crossentropy": 2.420962333679199, "loss/fcd": 1.53125, "loss/idx": 10.0, "loss/logits": 0.27131757140159607, "step": 6458 }, { "epoch": 0.09644689000216516, "grad_norm": 0.416015625, "grad_norm_var": 0.182360569636027, "learning_rate": 0.0001, "loss": 1.509, "loss/crossentropy": 2.5570809841156006, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.19646774232387543, "step": 6459 }, { "epoch": 0.09646182217278014, "grad_norm": 0.306640625, "grad_norm_var": 0.18401126861572265, "learning_rate": 0.0001, "loss": 1.3947, "loss/crossentropy": 2.8071212768554688, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.18375811725854874, "step": 6460 }, { "epoch": 0.09647675434339513, "grad_norm": 0.390625, "grad_norm_var": 0.18474114735921224, "learning_rate": 0.0001, "loss": 1.4602, "loss/crossentropy": 2.5885348320007324, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.18285094946622849, "step": 6461 }, { "epoch": 0.0964916865140101, "grad_norm": 0.32421875, "grad_norm_var": 0.18560280799865722, "learning_rate": 0.0001, "loss": 1.5518, "loss/crossentropy": 2.7480058670043945, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.21587789058685303, "step": 6462 }, { "epoch": 0.0965066186846251, "grad_norm": 0.42578125, "grad_norm_var": 0.1841883182525635, "learning_rate": 0.0001, "loss": 1.7817, "loss/crossentropy": 2.1080941557884216, "loss/fcd": 1.52734375, "loss/idx": 10.0, "loss/logits": 0.25440147519111633, "step": 6463 }, { "epoch": 0.09652155085524007, "grad_norm": 0.59375, "grad_norm_var": 0.18479180335998535, "learning_rate": 0.0001, "loss": 1.6924, "loss/crossentropy": 2.4904619455337524, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.23539189994335175, "step": 6464 }, { "epoch": 0.09653648302585505, "grad_norm": 0.34375, "grad_norm_var": 0.18566791216532388, "learning_rate": 0.0001, "loss": 1.4765, "loss/crossentropy": 2.4652334451675415, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19915590435266495, "step": 6465 }, { "epoch": 0.09655141519647004, "grad_norm": 0.38671875, "grad_norm_var": 0.185453732808431, "learning_rate": 0.0001, "loss": 1.6668, "loss/crossentropy": 2.3437373638153076, "loss/fcd": 1.453125, "loss/idx": 10.0, "loss/logits": 0.2137007713317871, "step": 6466 }, { "epoch": 0.09656634736708501, "grad_norm": 0.33984375, "grad_norm_var": 0.1866860548655192, "learning_rate": 0.0001, "loss": 1.3188, "loss/crossentropy": 2.464591383934021, "loss/fcd": 1.16015625, "loss/idx": 10.0, "loss/logits": 0.1586906909942627, "step": 6467 }, { "epoch": 0.0965812795377, "grad_norm": 0.34375, "grad_norm_var": 0.18694451649983723, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.521895408630371, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.1766013354063034, "step": 6468 }, { "epoch": 0.09659621170831498, "grad_norm": 0.384765625, "grad_norm_var": 0.18572920163472492, "learning_rate": 0.0001, "loss": 1.6146, "loss/crossentropy": 2.4120094776153564, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.20054514706134796, "step": 6469 }, { "epoch": 0.09661114387892997, "grad_norm": 0.357421875, "grad_norm_var": 0.1860005537668864, "learning_rate": 0.0001, "loss": 1.5622, "loss/crossentropy": 2.758628010749817, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.23016710579395294, "step": 6470 }, { "epoch": 0.09662607604954494, "grad_norm": 0.46484375, "grad_norm_var": 0.18556885719299315, "learning_rate": 0.0001, "loss": 1.6654, "loss/crossentropy": 2.6971622705459595, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.2434849888086319, "step": 6471 }, { "epoch": 0.09664100822015992, "grad_norm": 0.359375, "grad_norm_var": 0.18637083371480306, "learning_rate": 0.0001, "loss": 1.4641, "loss/crossentropy": 2.6345417499542236, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19071245193481445, "step": 6472 }, { "epoch": 0.09665594039077491, "grad_norm": 0.345703125, "grad_norm_var": 0.004723612467447917, "learning_rate": 0.0001, "loss": 1.5712, "loss/crossentropy": 2.6663565635681152, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.22353725880384445, "step": 6473 }, { "epoch": 0.09667087256138988, "grad_norm": 0.361328125, "grad_norm_var": 0.004759661356608073, "learning_rate": 0.0001, "loss": 1.5531, "loss/crossentropy": 2.427812933921814, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.21714814007282257, "step": 6474 }, { "epoch": 0.09668580473200487, "grad_norm": 0.380859375, "grad_norm_var": 0.004686991373697917, "learning_rate": 0.0001, "loss": 1.6, "loss/crossentropy": 2.5815032720565796, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.21327417343854904, "step": 6475 }, { "epoch": 0.09670073690261985, "grad_norm": 0.349609375, "grad_norm_var": 0.00437157948811849, "learning_rate": 0.0001, "loss": 1.6094, "loss/crossentropy": 2.6269898414611816, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.21486805379390717, "step": 6476 }, { "epoch": 0.09671566907323483, "grad_norm": 0.365234375, "grad_norm_var": 0.004391209284464518, "learning_rate": 0.0001, "loss": 1.3771, "loss/crossentropy": 2.7108794450759888, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.17005956172943115, "step": 6477 }, { "epoch": 0.09673060124384981, "grad_norm": 0.369140625, "grad_norm_var": 0.0041656494140625, "learning_rate": 0.0001, "loss": 1.5362, "loss/crossentropy": 2.415353536605835, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.20420613884925842, "step": 6478 }, { "epoch": 0.09674553341446479, "grad_norm": 0.361328125, "grad_norm_var": 0.004081201553344726, "learning_rate": 0.0001, "loss": 1.5666, "loss/crossentropy": 2.4758474826812744, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.22281081229448318, "step": 6479 }, { "epoch": 0.09676046558507978, "grad_norm": 0.36328125, "grad_norm_var": 0.0008852481842041016, "learning_rate": 0.0001, "loss": 1.4761, "loss/crossentropy": 2.6102776527404785, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.20269117504358292, "step": 6480 }, { "epoch": 0.09677539775569476, "grad_norm": 0.341796875, "grad_norm_var": 0.0008916219075520833, "learning_rate": 0.0001, "loss": 1.5057, "loss/crossentropy": 2.45003604888916, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.20490968227386475, "step": 6481 }, { "epoch": 0.09679032992630973, "grad_norm": 0.61328125, "grad_norm_var": 0.004689788818359375, "learning_rate": 0.0001, "loss": 1.6775, "loss/crossentropy": 2.5029616355895996, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.2204231321811676, "step": 6482 }, { "epoch": 0.09680526209692472, "grad_norm": 0.345703125, "grad_norm_var": 0.004659509658813477, "learning_rate": 0.0001, "loss": 1.4499, "loss/crossentropy": 2.6155474185943604, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.19600551575422287, "step": 6483 }, { "epoch": 0.0968201942675397, "grad_norm": 0.328125, "grad_norm_var": 0.004753859837849935, "learning_rate": 0.0001, "loss": 1.4016, "loss/crossentropy": 2.420276165008545, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17893141508102417, "step": 6484 }, { "epoch": 0.09683512643815469, "grad_norm": 0.4296875, "grad_norm_var": 0.004904111226399739, "learning_rate": 0.0001, "loss": 1.7392, "loss/crossentropy": 2.5418715476989746, "loss/fcd": 1.4765625, "loss/idx": 10.0, "loss/logits": 0.262629471719265, "step": 6485 }, { "epoch": 0.09685005860876966, "grad_norm": 0.345703125, "grad_norm_var": 0.004953511555989583, "learning_rate": 0.0001, "loss": 1.6391, "loss/crossentropy": 2.147402346134186, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.2328650951385498, "step": 6486 }, { "epoch": 0.09686499077938465, "grad_norm": 0.41796875, "grad_norm_var": 0.004578145345052084, "learning_rate": 0.0001, "loss": 1.4712, "loss/crossentropy": 2.266266703605652, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.16648923605680466, "step": 6487 }, { "epoch": 0.09687992294999963, "grad_norm": 0.37890625, "grad_norm_var": 0.00454858144124349, "learning_rate": 0.0001, "loss": 1.4769, "loss/crossentropy": 2.5318537950515747, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.18389695137739182, "step": 6488 }, { "epoch": 0.0968948551206146, "grad_norm": 0.3984375, "grad_norm_var": 0.004473479588826498, "learning_rate": 0.0001, "loss": 1.4756, "loss/crossentropy": 2.5446972846984863, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19826176762580872, "step": 6489 }, { "epoch": 0.0969097872912296, "grad_norm": 0.34765625, "grad_norm_var": 0.0045272191365559895, "learning_rate": 0.0001, "loss": 1.4095, "loss/crossentropy": 2.703696370124817, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1790645718574524, "step": 6490 }, { "epoch": 0.09692471946184457, "grad_norm": 0.33203125, "grad_norm_var": 0.00469371477762858, "learning_rate": 0.0001, "loss": 1.406, "loss/crossentropy": 2.4763048887252808, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.17556780576705933, "step": 6491 }, { "epoch": 0.09693965163245956, "grad_norm": 0.33984375, "grad_norm_var": 0.004739888509114583, "learning_rate": 0.0001, "loss": 1.4648, "loss/crossentropy": 2.5094884634017944, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.1913837566971779, "step": 6492 }, { "epoch": 0.09695458380307453, "grad_norm": 0.3359375, "grad_norm_var": 0.00485075314839681, "learning_rate": 0.0001, "loss": 1.4604, "loss/crossentropy": 2.5404165983200073, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.1830393299460411, "step": 6493 }, { "epoch": 0.09696951597368951, "grad_norm": 0.330078125, "grad_norm_var": 0.004992532730102539, "learning_rate": 0.0001, "loss": 1.4618, "loss/crossentropy": 2.586045503616333, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.20007067918777466, "step": 6494 }, { "epoch": 0.0969844481443045, "grad_norm": 0.3828125, "grad_norm_var": 0.00498046875, "learning_rate": 0.0001, "loss": 1.5102, "loss/crossentropy": 2.6500829458236694, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.2132793366909027, "step": 6495 }, { "epoch": 0.09699938031491948, "grad_norm": 0.35546875, "grad_norm_var": 0.004998524983723958, "learning_rate": 0.0001, "loss": 1.5062, "loss/crossentropy": 2.5485817193984985, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.20153313875198364, "step": 6496 }, { "epoch": 0.09701431248553447, "grad_norm": 0.345703125, "grad_norm_var": 0.004981422424316406, "learning_rate": 0.0001, "loss": 1.3983, "loss/crossentropy": 2.703535795211792, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17559673637151718, "step": 6497 }, { "epoch": 0.09702924465614944, "grad_norm": 0.337890625, "grad_norm_var": 0.0010347843170166015, "learning_rate": 0.0001, "loss": 1.5112, "loss/crossentropy": 2.5546904802322388, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.19089334458112717, "step": 6498 }, { "epoch": 0.09704417682676442, "grad_norm": 0.390625, "grad_norm_var": 0.0010782877604166667, "learning_rate": 0.0001, "loss": 1.5998, "loss/crossentropy": 2.522827386856079, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.22087042778730392, "step": 6499 }, { "epoch": 0.0970591089973794, "grad_norm": 0.375, "grad_norm_var": 0.0010019938151041667, "learning_rate": 0.0001, "loss": 1.4743, "loss/crossentropy": 2.6262853145599365, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.18525108695030212, "step": 6500 }, { "epoch": 0.09707404116799438, "grad_norm": 0.3828125, "grad_norm_var": 0.0007364908854166667, "learning_rate": 0.0001, "loss": 1.5836, "loss/crossentropy": 2.682006001472473, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.20862165838479996, "step": 6501 }, { "epoch": 0.09708897333860937, "grad_norm": 0.37890625, "grad_norm_var": 0.0007318973541259766, "learning_rate": 0.0001, "loss": 1.4795, "loss/crossentropy": 2.782347559928894, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.2021266371011734, "step": 6502 }, { "epoch": 0.09710390550922435, "grad_norm": 0.365234375, "grad_norm_var": 0.0005289077758789063, "learning_rate": 0.0001, "loss": 1.7239, "loss/crossentropy": 2.712070941925049, "loss/fcd": 1.46875, "loss/idx": 10.0, "loss/logits": 0.2551772817969322, "step": 6503 }, { "epoch": 0.09711883767983932, "grad_norm": 0.322265625, "grad_norm_var": 0.0005948225657145183, "learning_rate": 0.0001, "loss": 1.3827, "loss/crossentropy": 2.604658007621765, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.1795804649591446, "step": 6504 }, { "epoch": 0.09713376985045431, "grad_norm": 0.353515625, "grad_norm_var": 0.0004760106404622396, "learning_rate": 0.0001, "loss": 1.3574, "loss/crossentropy": 2.59336519241333, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.169879250228405, "step": 6505 }, { "epoch": 0.09714870202106929, "grad_norm": 0.314453125, "grad_norm_var": 0.0005762577056884766, "learning_rate": 0.0001, "loss": 1.4395, "loss/crossentropy": 2.5076541900634766, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18947239220142365, "step": 6506 }, { "epoch": 0.09716363419168428, "grad_norm": 0.45703125, "grad_norm_var": 0.0012089888254801433, "learning_rate": 0.0001, "loss": 1.7234, "loss/crossentropy": 2.65135657787323, "loss/fcd": 1.484375, "loss/idx": 10.0, "loss/logits": 0.23904704302549362, "step": 6507 }, { "epoch": 0.09717856636229925, "grad_norm": 0.396484375, "grad_norm_var": 0.0012537002563476562, "learning_rate": 0.0001, "loss": 1.6243, "loss/crossentropy": 2.5841904878616333, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.23761653900146484, "step": 6508 }, { "epoch": 0.09719349853291424, "grad_norm": 0.349609375, "grad_norm_var": 0.0012142022450764975, "learning_rate": 0.0001, "loss": 1.4909, "loss/crossentropy": 2.5183998346328735, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.20180821418762207, "step": 6509 }, { "epoch": 0.09720843070352922, "grad_norm": 0.330078125, "grad_norm_var": 0.0012142022450764975, "learning_rate": 0.0001, "loss": 1.4258, "loss/crossentropy": 2.602653980255127, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18361754715442657, "step": 6510 }, { "epoch": 0.0972233628741442, "grad_norm": 0.373046875, "grad_norm_var": 0.0011967976888020834, "learning_rate": 0.0001, "loss": 1.6128, "loss/crossentropy": 2.473088026046753, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.2260531708598137, "step": 6511 }, { "epoch": 0.09723829504475918, "grad_norm": 0.302734375, "grad_norm_var": 0.0014324029286702474, "learning_rate": 0.0001, "loss": 1.3989, "loss/crossentropy": 2.7142045497894287, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.1801202893257141, "step": 6512 }, { "epoch": 0.09725322721537416, "grad_norm": 0.32421875, "grad_norm_var": 0.0015049616495768229, "learning_rate": 0.0001, "loss": 1.5244, "loss/crossentropy": 2.6489299535751343, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.2236582487821579, "step": 6513 }, { "epoch": 0.09726815938598915, "grad_norm": 0.373046875, "grad_norm_var": 0.0014803568522135417, "learning_rate": 0.0001, "loss": 1.4569, "loss/crossentropy": 2.523000717163086, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1990988701581955, "step": 6514 }, { "epoch": 0.09728309155660413, "grad_norm": 0.32421875, "grad_norm_var": 0.0015008926391601562, "learning_rate": 0.0001, "loss": 1.4581, "loss/crossentropy": 2.4765830039978027, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19252432137727737, "step": 6515 }, { "epoch": 0.0972980237272191, "grad_norm": 0.3515625, "grad_norm_var": 0.0014810562133789062, "learning_rate": 0.0001, "loss": 1.3441, "loss/crossentropy": 2.59154212474823, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.1682998389005661, "step": 6516 }, { "epoch": 0.09731295589783409, "grad_norm": 0.3359375, "grad_norm_var": 0.0014520645141601562, "learning_rate": 0.0001, "loss": 1.5121, "loss/crossentropy": 2.5001556873321533, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.20737352967262268, "step": 6517 }, { "epoch": 0.09732788806844907, "grad_norm": 0.31640625, "grad_norm_var": 0.0014825820922851562, "learning_rate": 0.0001, "loss": 1.3722, "loss/crossentropy": 2.608230233192444, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.17684362083673477, "step": 6518 }, { "epoch": 0.09734282023906406, "grad_norm": 0.400390625, "grad_norm_var": 0.00163421630859375, "learning_rate": 0.0001, "loss": 1.9121, "loss/crossentropy": 2.459450840950012, "loss/fcd": 1.6171875, "loss/idx": 10.0, "loss/logits": 0.2948927581310272, "step": 6519 }, { "epoch": 0.09735775240967903, "grad_norm": 0.390625, "grad_norm_var": 0.001659250259399414, "learning_rate": 0.0001, "loss": 1.5571, "loss/crossentropy": 2.4291744232177734, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.21334467828273773, "step": 6520 }, { "epoch": 0.09737268458029401, "grad_norm": 0.390625, "grad_norm_var": 0.0017338434855143229, "learning_rate": 0.0001, "loss": 1.7715, "loss/crossentropy": 2.389396548271179, "loss/fcd": 1.48046875, "loss/idx": 10.0, "loss/logits": 0.2910536676645279, "step": 6521 }, { "epoch": 0.097387616750909, "grad_norm": 0.404296875, "grad_norm_var": 0.0017148335774739584, "learning_rate": 0.0001, "loss": 1.5959, "loss/crossentropy": 2.5482606887817383, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.22086993604898453, "step": 6522 }, { "epoch": 0.09740254892152397, "grad_norm": 0.384765625, "grad_norm_var": 0.0011426130930582681, "learning_rate": 0.0001, "loss": 1.6672, "loss/crossentropy": 2.719121813774109, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.2491813451051712, "step": 6523 }, { "epoch": 0.09741748109213896, "grad_norm": 0.431640625, "grad_norm_var": 0.0013943831125895181, "learning_rate": 0.0001, "loss": 1.5942, "loss/crossentropy": 2.8231817483901978, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.2230752632021904, "step": 6524 }, { "epoch": 0.09743241326275394, "grad_norm": 0.322265625, "grad_norm_var": 0.0014842828114827475, "learning_rate": 0.0001, "loss": 1.4559, "loss/crossentropy": 2.5875236988067627, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.1941334828734398, "step": 6525 }, { "epoch": 0.09744734543336891, "grad_norm": 0.349609375, "grad_norm_var": 0.0014308770497639975, "learning_rate": 0.0001, "loss": 1.6454, "loss/crossentropy": 2.4385669231414795, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.2469516545534134, "step": 6526 }, { "epoch": 0.0974622776039839, "grad_norm": 0.341796875, "grad_norm_var": 0.0014415582021077473, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.749295234680176, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18668943643569946, "step": 6527 }, { "epoch": 0.09747720977459888, "grad_norm": 0.390625, "grad_norm_var": 0.0012648900349934895, "learning_rate": 0.0001, "loss": 1.6823, "loss/crossentropy": 2.2607911229133606, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.2564959228038788, "step": 6528 }, { "epoch": 0.09749214194521387, "grad_norm": 0.37109375, "grad_norm_var": 0.0011504491170247395, "learning_rate": 0.0001, "loss": 1.5721, "loss/crossentropy": 2.6018619537353516, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.21272046864032745, "step": 6529 }, { "epoch": 0.09750707411582885, "grad_norm": 1.015625, "grad_norm_var": 0.02743821144104004, "learning_rate": 0.0001, "loss": 1.7119, "loss/crossentropy": 3.2709970474243164, "loss/fcd": 1.53515625, "loss/idx": 10.0, "loss/logits": 0.17673521488904953, "step": 6530 }, { "epoch": 0.09752200628644384, "grad_norm": 0.33984375, "grad_norm_var": 0.027279774347941082, "learning_rate": 0.0001, "loss": 1.4259, "loss/crossentropy": 2.579951286315918, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18373076617717743, "step": 6531 }, { "epoch": 0.09753693845705881, "grad_norm": 0.369140625, "grad_norm_var": 0.02716547648111979, "learning_rate": 0.0001, "loss": 1.4615, "loss/crossentropy": 2.5288490056991577, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.1919994205236435, "step": 6532 }, { "epoch": 0.09755187062767379, "grad_norm": 0.380859375, "grad_norm_var": 0.026849985122680664, "learning_rate": 0.0001, "loss": 1.5447, "loss/crossentropy": 2.8245444297790527, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.2243994101881981, "step": 6533 }, { "epoch": 0.09756680279828878, "grad_norm": 0.341796875, "grad_norm_var": 0.026565043131510417, "learning_rate": 0.0001, "loss": 1.3946, "loss/crossentropy": 2.610777497291565, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.16802185028791428, "step": 6534 }, { "epoch": 0.09758173496890375, "grad_norm": 0.37109375, "grad_norm_var": 0.02667209307352702, "learning_rate": 0.0001, "loss": 1.5107, "loss/crossentropy": 2.4548884630203247, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.18643533438444138, "step": 6535 }, { "epoch": 0.09759666713951874, "grad_norm": 0.33984375, "grad_norm_var": 0.02697955767313639, "learning_rate": 0.0001, "loss": 1.5356, "loss/crossentropy": 2.62328839302063, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.21918387711048126, "step": 6536 }, { "epoch": 0.09761159931013372, "grad_norm": 0.35546875, "grad_norm_var": 0.027143208185831706, "learning_rate": 0.0001, "loss": 1.4916, "loss/crossentropy": 2.5313849449157715, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.194696344435215, "step": 6537 }, { "epoch": 0.0976265314807487, "grad_norm": 0.31640625, "grad_norm_var": 0.027656046549479167, "learning_rate": 0.0001, "loss": 1.591, "loss/crossentropy": 2.540021061897278, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.235533244907856, "step": 6538 }, { "epoch": 0.09764146365136368, "grad_norm": 0.37109375, "grad_norm_var": 0.027697992324829102, "learning_rate": 0.0001, "loss": 1.4495, "loss/crossentropy": 2.4825323820114136, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18384630978107452, "step": 6539 }, { "epoch": 0.09765639582197866, "grad_norm": 0.38671875, "grad_norm_var": 0.027637672424316407, "learning_rate": 0.0001, "loss": 1.5492, "loss/crossentropy": 2.708535075187683, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.22110316157341003, "step": 6540 }, { "epoch": 0.09767132799259365, "grad_norm": 0.419921875, "grad_norm_var": 0.027251434326171876, "learning_rate": 0.0001, "loss": 1.6125, "loss/crossentropy": 2.4949487447738647, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.22185628861188889, "step": 6541 }, { "epoch": 0.09768626016320862, "grad_norm": 0.458984375, "grad_norm_var": 0.027208709716796876, "learning_rate": 0.0001, "loss": 1.7696, "loss/crossentropy": 2.5164047479629517, "loss/fcd": 1.51953125, "loss/idx": 10.0, "loss/logits": 0.25007812678813934, "step": 6542 }, { "epoch": 0.0977011923338236, "grad_norm": 0.345703125, "grad_norm_var": 0.027173805236816406, "learning_rate": 0.0001, "loss": 1.5865, "loss/crossentropy": 2.4813610315322876, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.20365218818187714, "step": 6543 }, { "epoch": 0.09771612450443859, "grad_norm": 0.337890625, "grad_norm_var": 0.02749009132385254, "learning_rate": 0.0001, "loss": 1.5505, "loss/crossentropy": 2.760603427886963, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.2223711982369423, "step": 6544 }, { "epoch": 0.09773105667505357, "grad_norm": 0.41015625, "grad_norm_var": 0.027395359675089517, "learning_rate": 0.0001, "loss": 1.6297, "loss/crossentropy": 2.6039010286331177, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.22731931507587433, "step": 6545 }, { "epoch": 0.09774598884566855, "grad_norm": 0.353515625, "grad_norm_var": 0.0013323465983072916, "learning_rate": 0.0001, "loss": 1.5096, "loss/crossentropy": 2.5662670135498047, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.20487233251333237, "step": 6546 }, { "epoch": 0.09776092101628353, "grad_norm": 0.482421875, "grad_norm_var": 0.0020552158355712892, "learning_rate": 0.0001, "loss": 1.6958, "loss/crossentropy": 2.588643431663513, "loss/fcd": 1.46875, "loss/idx": 10.0, "loss/logits": 0.2270236685872078, "step": 6547 }, { "epoch": 0.09777585318689852, "grad_norm": 0.404296875, "grad_norm_var": 0.0020929813385009766, "learning_rate": 0.0001, "loss": 1.6893, "loss/crossentropy": 2.5722360610961914, "loss/fcd": 1.44921875, "loss/idx": 10.0, "loss/logits": 0.24005717039108276, "step": 6548 }, { "epoch": 0.0977907853575135, "grad_norm": 0.31640625, "grad_norm_var": 0.0023431777954101562, "learning_rate": 0.0001, "loss": 1.3514, "loss/crossentropy": 2.495514392852783, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.17176074534654617, "step": 6549 }, { "epoch": 0.09780571752812847, "grad_norm": 0.380859375, "grad_norm_var": 0.0022617975870768228, "learning_rate": 0.0001, "loss": 1.44, "loss/crossentropy": 2.74501895904541, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.17823446542024612, "step": 6550 }, { "epoch": 0.09782064969874346, "grad_norm": 0.3046875, "grad_norm_var": 0.00260009765625, "learning_rate": 0.0001, "loss": 1.438, "loss/crossentropy": 2.253340244293213, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18799841403961182, "step": 6551 }, { "epoch": 0.09783558186935844, "grad_norm": 0.447265625, "grad_norm_var": 0.002831761042277018, "learning_rate": 0.0001, "loss": 1.5772, "loss/crossentropy": 2.412277936935425, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.18267444521188736, "step": 6552 }, { "epoch": 0.09785051403997343, "grad_norm": 0.353515625, "grad_norm_var": 0.002838579813639323, "learning_rate": 0.0001, "loss": 1.4172, "loss/crossentropy": 2.5423665046691895, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.19840306043624878, "step": 6553 }, { "epoch": 0.0978654462105884, "grad_norm": 0.388671875, "grad_norm_var": 0.002546294530232747, "learning_rate": 0.0001, "loss": 1.5983, "loss/crossentropy": 2.9392573833465576, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.2428741306066513, "step": 6554 }, { "epoch": 0.09788037838120338, "grad_norm": 0.341796875, "grad_norm_var": 0.002654774983723958, "learning_rate": 0.0001, "loss": 1.4525, "loss/crossentropy": 2.8022067546844482, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18691381812095642, "step": 6555 }, { "epoch": 0.09789531055181837, "grad_norm": 0.34375, "grad_norm_var": 0.002750587463378906, "learning_rate": 0.0001, "loss": 1.5496, "loss/crossentropy": 2.607336640357971, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.209718257188797, "step": 6556 }, { "epoch": 0.09791024272243334, "grad_norm": 0.35546875, "grad_norm_var": 0.002672433853149414, "learning_rate": 0.0001, "loss": 1.3485, "loss/crossentropy": 2.724748730659485, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.16879145801067352, "step": 6557 }, { "epoch": 0.09792517489304833, "grad_norm": 0.380859375, "grad_norm_var": 0.002195596694946289, "learning_rate": 0.0001, "loss": 1.5013, "loss/crossentropy": 2.6813377141952515, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19665154814720154, "step": 6558 }, { "epoch": 0.09794010706366331, "grad_norm": 0.33984375, "grad_norm_var": 0.0022180557250976564, "learning_rate": 0.0001, "loss": 1.4789, "loss/crossentropy": 2.6763577461242676, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.1898408979177475, "step": 6559 }, { "epoch": 0.09795503923427828, "grad_norm": 0.3671875, "grad_norm_var": 0.0021410465240478517, "learning_rate": 0.0001, "loss": 1.5172, "loss/crossentropy": 2.84006130695343, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.2085815817117691, "step": 6560 }, { "epoch": 0.09796997140489327, "grad_norm": 0.32421875, "grad_norm_var": 0.002178812026977539, "learning_rate": 0.0001, "loss": 1.4751, "loss/crossentropy": 2.708648681640625, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.1977185383439064, "step": 6561 }, { "epoch": 0.09798490357550825, "grad_norm": 0.33984375, "grad_norm_var": 0.002216529846191406, "learning_rate": 0.0001, "loss": 1.4598, "loss/crossentropy": 2.7594244480133057, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.2019781991839409, "step": 6562 }, { "epoch": 0.09799983574612324, "grad_norm": 0.314453125, "grad_norm_var": 0.0013936360677083333, "learning_rate": 0.0001, "loss": 1.4753, "loss/crossentropy": 2.7864454984664917, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.20187438279390335, "step": 6563 }, { "epoch": 0.09801476791673822, "grad_norm": 0.376953125, "grad_norm_var": 0.0012659072875976563, "learning_rate": 0.0001, "loss": 1.5952, "loss/crossentropy": 2.5670067071914673, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.2201589122414589, "step": 6564 }, { "epoch": 0.09802970008735319, "grad_norm": 1.3203125, "grad_norm_var": 0.05912450154622396, "learning_rate": 0.0001, "loss": 1.6012, "loss/crossentropy": 2.758581757545471, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.2105858102440834, "step": 6565 }, { "epoch": 0.09804463225796818, "grad_norm": 0.3515625, "grad_norm_var": 0.05932119687398275, "learning_rate": 0.0001, "loss": 1.431, "loss/crossentropy": 2.5660780668258667, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18876848369836807, "step": 6566 }, { "epoch": 0.09805956442858316, "grad_norm": 0.380859375, "grad_norm_var": 0.05855687459309896, "learning_rate": 0.0001, "loss": 1.5121, "loss/crossentropy": 2.6652008295059204, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.19574030488729477, "step": 6567 }, { "epoch": 0.09807449659919815, "grad_norm": 0.375, "grad_norm_var": 0.0586245059967041, "learning_rate": 0.0001, "loss": 1.5508, "loss/crossentropy": 2.5799291133880615, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.20704050362110138, "step": 6568 }, { "epoch": 0.09808942876981312, "grad_norm": 0.33984375, "grad_norm_var": 0.05874989827473958, "learning_rate": 0.0001, "loss": 1.4076, "loss/crossentropy": 2.405172348022461, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.15755780041217804, "step": 6569 }, { "epoch": 0.09810436094042811, "grad_norm": 0.408203125, "grad_norm_var": 0.058705075581868486, "learning_rate": 0.0001, "loss": 1.7944, "loss/crossentropy": 2.7129218578338623, "loss/fcd": 1.48046875, "loss/idx": 10.0, "loss/logits": 0.31391607969999313, "step": 6570 }, { "epoch": 0.09811929311104309, "grad_norm": 0.419921875, "grad_norm_var": 0.05831089019775391, "learning_rate": 0.0001, "loss": 1.5539, "loss/crossentropy": 2.5968395471572876, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.22183894366025925, "step": 6571 }, { "epoch": 0.09813422528165806, "grad_norm": 0.396484375, "grad_norm_var": 0.05794053077697754, "learning_rate": 0.0001, "loss": 1.6396, "loss/crossentropy": 2.446293592453003, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.22553998231887817, "step": 6572 }, { "epoch": 0.09814915745227305, "grad_norm": 0.37109375, "grad_norm_var": 0.057812102635701496, "learning_rate": 0.0001, "loss": 1.5686, "loss/crossentropy": 2.7237720489501953, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.22090866416692734, "step": 6573 }, { "epoch": 0.09816408962288803, "grad_norm": 0.34375, "grad_norm_var": 0.05811862945556641, "learning_rate": 0.0001, "loss": 1.4448, "loss/crossentropy": 2.7486382722854614, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.19479575008153915, "step": 6574 }, { "epoch": 0.09817902179350302, "grad_norm": 0.39453125, "grad_norm_var": 0.057698504130045576, "learning_rate": 0.0001, "loss": 1.5454, "loss/crossentropy": 2.4056460857391357, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.20160405337810516, "step": 6575 }, { "epoch": 0.098193953964118, "grad_norm": 0.330078125, "grad_norm_var": 0.05807811419169108, "learning_rate": 0.0001, "loss": 1.3957, "loss/crossentropy": 2.639788508415222, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.1691073775291443, "step": 6576 }, { "epoch": 0.09820888613473297, "grad_norm": 0.341796875, "grad_norm_var": 0.057863108317057294, "learning_rate": 0.0001, "loss": 1.3891, "loss/crossentropy": 2.4940396547317505, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.17423004657030106, "step": 6577 }, { "epoch": 0.09822381830534796, "grad_norm": 0.35546875, "grad_norm_var": 0.05770034790039062, "learning_rate": 0.0001, "loss": 1.4639, "loss/crossentropy": 2.686642646789551, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19043047726154327, "step": 6578 }, { "epoch": 0.09823875047596294, "grad_norm": 0.41015625, "grad_norm_var": 0.056845966974894205, "learning_rate": 0.0001, "loss": 1.6137, "loss/crossentropy": 2.5669376850128174, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.21525217592716217, "step": 6579 }, { "epoch": 0.09825368264657793, "grad_norm": 0.359375, "grad_norm_var": 0.05699488321940104, "learning_rate": 0.0001, "loss": 1.6143, "loss/crossentropy": 2.7111220359802246, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.23143813014030457, "step": 6580 }, { "epoch": 0.0982686148171929, "grad_norm": 0.388671875, "grad_norm_var": 0.0007916609446207682, "learning_rate": 0.0001, "loss": 1.676, "loss/crossentropy": 2.440187931060791, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.2580425813794136, "step": 6581 }, { "epoch": 0.09828354698780788, "grad_norm": 0.3359375, "grad_norm_var": 0.0008514245351155599, "learning_rate": 0.0001, "loss": 1.4271, "loss/crossentropy": 2.4337849617004395, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.1770942136645317, "step": 6582 }, { "epoch": 0.09829847915842287, "grad_norm": 0.357421875, "grad_norm_var": 0.0008579095204671224, "learning_rate": 0.0001, "loss": 1.5283, "loss/crossentropy": 2.5933220386505127, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.2235828936100006, "step": 6583 }, { "epoch": 0.09831341132903784, "grad_norm": 0.333984375, "grad_norm_var": 0.0009383519490559895, "learning_rate": 0.0001, "loss": 1.41, "loss/crossentropy": 2.646627426147461, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.18346233665943146, "step": 6584 }, { "epoch": 0.09832834349965283, "grad_norm": 0.333984375, "grad_norm_var": 0.0009624322255452473, "learning_rate": 0.0001, "loss": 1.4648, "loss/crossentropy": 2.6482179164886475, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.19527024775743484, "step": 6585 }, { "epoch": 0.09834327567026781, "grad_norm": 0.51171875, "grad_norm_var": 0.002193196614583333, "learning_rate": 0.0001, "loss": 1.8011, "loss/crossentropy": 3.167845129966736, "loss/fcd": 1.55078125, "loss/idx": 10.0, "loss/logits": 0.2502850890159607, "step": 6586 }, { "epoch": 0.09835820784088278, "grad_norm": 0.33984375, "grad_norm_var": 0.0021039168039957683, "learning_rate": 0.0001, "loss": 1.4604, "loss/crossentropy": 2.6956883668899536, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.18309424817562103, "step": 6587 }, { "epoch": 0.09837314001149777, "grad_norm": 0.388671875, "grad_norm_var": 0.002079121271769206, "learning_rate": 0.0001, "loss": 1.5072, "loss/crossentropy": 2.7188560962677, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.20638316869735718, "step": 6588 }, { "epoch": 0.09838807218211275, "grad_norm": 0.3515625, "grad_norm_var": 0.0020962874094645183, "learning_rate": 0.0001, "loss": 1.5918, "loss/crossentropy": 2.979625940322876, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.21683067083358765, "step": 6589 }, { "epoch": 0.09840300435272774, "grad_norm": 0.400390625, "grad_norm_var": 0.0021188735961914064, "learning_rate": 0.0001, "loss": 1.4638, "loss/crossentropy": 2.526752233505249, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.18642397224903107, "step": 6590 }, { "epoch": 0.09841793652334271, "grad_norm": 0.357421875, "grad_norm_var": 0.0020877679189046225, "learning_rate": 0.0001, "loss": 1.5647, "loss/crossentropy": 2.461261034011841, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.21700216829776764, "step": 6591 }, { "epoch": 0.0984328686939577, "grad_norm": 0.578125, "grad_norm_var": 0.00466149648030599, "learning_rate": 0.0001, "loss": 1.5662, "loss/crossentropy": 2.4428939819335938, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.20680546760559082, "step": 6592 }, { "epoch": 0.09844780086457268, "grad_norm": 0.369140625, "grad_norm_var": 0.004554239908854166, "learning_rate": 0.0001, "loss": 1.5466, "loss/crossentropy": 2.398833751678467, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.19893302023410797, "step": 6593 }, { "epoch": 0.09846273303518766, "grad_norm": 0.369140625, "grad_norm_var": 0.004510736465454102, "learning_rate": 0.0001, "loss": 1.3982, "loss/crossentropy": 2.796718120574951, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.17943963408470154, "step": 6594 }, { "epoch": 0.09847766520580264, "grad_norm": 0.3671875, "grad_norm_var": 0.004491154352823893, "learning_rate": 0.0001, "loss": 1.4808, "loss/crossentropy": 2.790671467781067, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19172187894582748, "step": 6595 }, { "epoch": 0.09849259737641762, "grad_norm": 0.423828125, "grad_norm_var": 0.0045399347941080725, "learning_rate": 0.0001, "loss": 1.5955, "loss/crossentropy": 2.5077794790267944, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.19314425438642502, "step": 6596 }, { "epoch": 0.09850752954703261, "grad_norm": 0.41015625, "grad_norm_var": 0.004570881525675456, "learning_rate": 0.0001, "loss": 1.6529, "loss/crossentropy": 2.4579954147338867, "loss/fcd": 1.4296875, "loss/idx": 10.0, "loss/logits": 0.22325849533081055, "step": 6597 }, { "epoch": 0.09852246171764759, "grad_norm": 0.373046875, "grad_norm_var": 0.00439300537109375, "learning_rate": 0.0001, "loss": 1.6164, "loss/crossentropy": 2.4827011823654175, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.2258242592215538, "step": 6598 }, { "epoch": 0.09853739388826256, "grad_norm": 0.3046875, "grad_norm_var": 0.004807138442993164, "learning_rate": 0.0001, "loss": 1.4271, "loss/crossentropy": 2.5433545112609863, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.19663356989622116, "step": 6599 }, { "epoch": 0.09855232605887755, "grad_norm": 0.337890625, "grad_norm_var": 0.004779799779256185, "learning_rate": 0.0001, "loss": 1.4835, "loss/crossentropy": 2.70371150970459, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.2061353102326393, "step": 6600 }, { "epoch": 0.09856725822949253, "grad_norm": 0.345703125, "grad_norm_var": 0.004703124364217122, "learning_rate": 0.0001, "loss": 1.4546, "loss/crossentropy": 2.57105028629303, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.18121102452278137, "step": 6601 }, { "epoch": 0.09858219040010752, "grad_norm": 0.39453125, "grad_norm_var": 0.00364836057027181, "learning_rate": 0.0001, "loss": 1.7111, "loss/crossentropy": 2.2630783319473267, "loss/fcd": 1.49609375, "loss/idx": 10.0, "loss/logits": 0.21497169137001038, "step": 6602 }, { "epoch": 0.09859712257072249, "grad_norm": 0.427734375, "grad_norm_var": 0.0036376317342122397, "learning_rate": 0.0001, "loss": 1.7106, "loss/crossentropy": 2.8632384538650513, "loss/fcd": 1.46484375, "loss/idx": 10.0, "loss/logits": 0.24576247483491898, "step": 6603 }, { "epoch": 0.09861205474133747, "grad_norm": 0.32421875, "grad_norm_var": 0.0038867791493733725, "learning_rate": 0.0001, "loss": 1.4454, "loss/crossentropy": 2.8371468782424927, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.19932771474123, "step": 6604 }, { "epoch": 0.09862698691195246, "grad_norm": 0.31640625, "grad_norm_var": 0.0041133721669514975, "learning_rate": 0.0001, "loss": 1.4601, "loss/crossentropy": 2.622017025947571, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19443198293447495, "step": 6605 }, { "epoch": 0.09864191908256743, "grad_norm": 0.38671875, "grad_norm_var": 0.004090118408203125, "learning_rate": 0.0001, "loss": 1.6613, "loss/crossentropy": 2.5652154684066772, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.22376542538404465, "step": 6606 }, { "epoch": 0.09865685125318242, "grad_norm": 0.30859375, "grad_norm_var": 0.00438853899637858, "learning_rate": 0.0001, "loss": 1.4502, "loss/crossentropy": 2.627303957939148, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.18851500004529953, "step": 6607 }, { "epoch": 0.0986717834237974, "grad_norm": 0.3515625, "grad_norm_var": 0.0015306949615478515, "learning_rate": 0.0001, "loss": 1.6094, "loss/crossentropy": 2.526332139968872, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.210920087993145, "step": 6608 }, { "epoch": 0.09868671559441239, "grad_norm": 0.34765625, "grad_norm_var": 0.0015424092610677083, "learning_rate": 0.0001, "loss": 1.5795, "loss/crossentropy": 2.8184767961502075, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.21618376672267914, "step": 6609 }, { "epoch": 0.09870164776502736, "grad_norm": 0.314453125, "grad_norm_var": 0.0016759236653645833, "learning_rate": 0.0001, "loss": 1.3854, "loss/crossentropy": 2.4793310165405273, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.16667314618825912, "step": 6610 }, { "epoch": 0.09871657993564234, "grad_norm": 0.447265625, "grad_norm_var": 0.0021705468495686847, "learning_rate": 0.0001, "loss": 1.9308, "loss/crossentropy": 2.6876513957977295, "loss/fcd": 1.6328125, "loss/idx": 10.0, "loss/logits": 0.2980278208851814, "step": 6611 }, { "epoch": 0.09873151210625733, "grad_norm": 0.35546875, "grad_norm_var": 0.0019118626912434896, "learning_rate": 0.0001, "loss": 1.6164, "loss/crossentropy": 2.6948167085647583, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.2531432807445526, "step": 6612 }, { "epoch": 0.0987464442768723, "grad_norm": 0.640625, "grad_norm_var": 0.006799570719401042, "learning_rate": 0.0001, "loss": 1.586, "loss/crossentropy": 2.145546793937683, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.1836223006248474, "step": 6613 }, { "epoch": 0.0987613764474873, "grad_norm": 0.423828125, "grad_norm_var": 0.006957435607910156, "learning_rate": 0.0001, "loss": 1.5418, "loss/crossentropy": 2.7425326108932495, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20583879202604294, "step": 6614 }, { "epoch": 0.09877630861810227, "grad_norm": 0.32421875, "grad_norm_var": 0.0067937215169270836, "learning_rate": 0.0001, "loss": 1.4038, "loss/crossentropy": 2.5919588804244995, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.17337147891521454, "step": 6615 }, { "epoch": 0.09879124078871725, "grad_norm": 0.3203125, "grad_norm_var": 0.006906874974568685, "learning_rate": 0.0001, "loss": 1.43, "loss/crossentropy": 2.657580018043518, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.19561021775007248, "step": 6616 }, { "epoch": 0.09880617295933224, "grad_norm": 0.388671875, "grad_norm_var": 0.006843932469685872, "learning_rate": 0.0001, "loss": 1.7004, "loss/crossentropy": 2.7135285139083862, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.2668508142232895, "step": 6617 }, { "epoch": 0.09882110512994721, "grad_norm": 0.90234375, "grad_norm_var": 0.02397764523824056, "learning_rate": 0.0001, "loss": 2.2725, "loss/crossentropy": 2.56410551071167, "loss/fcd": 2.02734375, "loss/idx": 10.0, "loss/logits": 0.2451501041650772, "step": 6618 }, { "epoch": 0.0988360373005622, "grad_norm": 0.365234375, "grad_norm_var": 0.02408445676167806, "learning_rate": 0.0001, "loss": 1.6065, "loss/crossentropy": 2.4797513484954834, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.2158849611878395, "step": 6619 }, { "epoch": 0.09885096947117718, "grad_norm": 0.359375, "grad_norm_var": 0.023772033055623372, "learning_rate": 0.0001, "loss": 1.3875, "loss/crossentropy": 2.6952428817749023, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.17266079038381577, "step": 6620 }, { "epoch": 0.09886590164179215, "grad_norm": 0.609375, "grad_norm_var": 0.025498183568318684, "learning_rate": 0.0001, "loss": 1.6597, "loss/crossentropy": 2.1710270643234253, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.22610090672969818, "step": 6621 }, { "epoch": 0.09888083381240714, "grad_norm": 0.396484375, "grad_norm_var": 0.02545057932535807, "learning_rate": 0.0001, "loss": 1.5037, "loss/crossentropy": 2.639650583267212, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.1950605884194374, "step": 6622 }, { "epoch": 0.09889576598302212, "grad_norm": 0.392578125, "grad_norm_var": 0.024549086888631184, "learning_rate": 0.0001, "loss": 1.3821, "loss/crossentropy": 2.5376349687576294, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.16723521053791046, "step": 6623 }, { "epoch": 0.09891069815363711, "grad_norm": 0.33203125, "grad_norm_var": 0.024786869684855144, "learning_rate": 0.0001, "loss": 1.431, "loss/crossentropy": 2.4804069995880127, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18097098171710968, "step": 6624 }, { "epoch": 0.09892563032425208, "grad_norm": 0.4453125, "grad_norm_var": 0.02427824338277181, "learning_rate": 0.0001, "loss": 1.6019, "loss/crossentropy": 2.5992345809936523, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.2268921360373497, "step": 6625 }, { "epoch": 0.09894056249486706, "grad_norm": 0.3203125, "grad_norm_var": 0.02418340047200521, "learning_rate": 0.0001, "loss": 1.4978, "loss/crossentropy": 2.492505669593811, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.19705906510353088, "step": 6626 }, { "epoch": 0.09895549466548205, "grad_norm": 0.337890625, "grad_norm_var": 0.024810028076171876, "learning_rate": 0.0001, "loss": 1.4332, "loss/crossentropy": 2.574850082397461, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.17542318999767303, "step": 6627 }, { "epoch": 0.09897042683609703, "grad_norm": 0.31640625, "grad_norm_var": 0.025304667154947915, "learning_rate": 0.0001, "loss": 1.4434, "loss/crossentropy": 2.5560985803604126, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18949677795171738, "step": 6628 }, { "epoch": 0.09898535900671201, "grad_norm": 0.408203125, "grad_norm_var": 0.022144047419230144, "learning_rate": 0.0001, "loss": 1.5794, "loss/crossentropy": 2.473829507827759, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.21610085666179657, "step": 6629 }, { "epoch": 0.09900029117732699, "grad_norm": 0.734375, "grad_norm_var": 0.028530375162760416, "learning_rate": 0.0001, "loss": 1.7219, "loss/crossentropy": 2.5467230081558228, "loss/fcd": 1.4921875, "loss/idx": 10.0, "loss/logits": 0.22970329970121384, "step": 6630 }, { "epoch": 0.09901522334794198, "grad_norm": 0.455078125, "grad_norm_var": 0.02767523129781087, "learning_rate": 0.0001, "loss": 1.6044, "loss/crossentropy": 2.580474853515625, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.2293628305196762, "step": 6631 }, { "epoch": 0.09903015551855696, "grad_norm": 0.349609375, "grad_norm_var": 0.02725060780843099, "learning_rate": 0.0001, "loss": 1.4008, "loss/crossentropy": 2.68217134475708, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.1781691610813141, "step": 6632 }, { "epoch": 0.09904508768917193, "grad_norm": 0.37109375, "grad_norm_var": 0.027400954564412435, "learning_rate": 0.0001, "loss": 1.5032, "loss/crossentropy": 2.5361427068710327, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.2063213735818863, "step": 6633 }, { "epoch": 0.09906001985978692, "grad_norm": 0.376953125, "grad_norm_var": 0.012508900960286458, "learning_rate": 0.0001, "loss": 1.4798, "loss/crossentropy": 2.50770366191864, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.18291454762220383, "step": 6634 }, { "epoch": 0.0990749520304019, "grad_norm": 0.3359375, "grad_norm_var": 0.012739928563435872, "learning_rate": 0.0001, "loss": 1.3306, "loss/crossentropy": 2.758120536804199, "loss/fcd": 1.1640625, "loss/idx": 10.0, "loss/logits": 0.16648776829242706, "step": 6635 }, { "epoch": 0.09908988420101689, "grad_norm": 0.388671875, "grad_norm_var": 0.012600453694661458, "learning_rate": 0.0001, "loss": 1.6119, "loss/crossentropy": 2.76951265335083, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.2291308492422104, "step": 6636 }, { "epoch": 0.09910481637163186, "grad_norm": 0.3125, "grad_norm_var": 0.010242462158203125, "learning_rate": 0.0001, "loss": 1.3411, "loss/crossentropy": 2.5064773559570312, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.1614581048488617, "step": 6637 }, { "epoch": 0.09911974854224684, "grad_norm": 0.3359375, "grad_norm_var": 0.010436105728149413, "learning_rate": 0.0001, "loss": 1.5346, "loss/crossentropy": 2.5961971282958984, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.20651183277368546, "step": 6638 }, { "epoch": 0.09913468071286183, "grad_norm": 0.3515625, "grad_norm_var": 0.01051788330078125, "learning_rate": 0.0001, "loss": 1.466, "loss/crossentropy": 2.4388351440429688, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.18864582479000092, "step": 6639 }, { "epoch": 0.0991496128834768, "grad_norm": 0.32421875, "grad_norm_var": 0.010577646891276042, "learning_rate": 0.0001, "loss": 1.3501, "loss/crossentropy": 2.549795150756836, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.16646328568458557, "step": 6640 }, { "epoch": 0.0991645450540918, "grad_norm": 0.435546875, "grad_norm_var": 0.010505406061808269, "learning_rate": 0.0001, "loss": 1.616, "loss/crossentropy": 2.6375828981399536, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.2370680868625641, "step": 6641 }, { "epoch": 0.09917947722470677, "grad_norm": 0.322265625, "grad_norm_var": 0.0104888916015625, "learning_rate": 0.0001, "loss": 1.5684, "loss/crossentropy": 2.1160377264022827, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.21681635081768036, "step": 6642 }, { "epoch": 0.09919440939532174, "grad_norm": 0.361328125, "grad_norm_var": 0.010376739501953124, "learning_rate": 0.0001, "loss": 1.4244, "loss/crossentropy": 2.6467114686965942, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18225347995758057, "step": 6643 }, { "epoch": 0.09920934156593673, "grad_norm": 0.435546875, "grad_norm_var": 0.01015470822652181, "learning_rate": 0.0001, "loss": 1.5984, "loss/crossentropy": 2.484670042991638, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.22337307035923004, "step": 6644 }, { "epoch": 0.09922427373655171, "grad_norm": 0.392578125, "grad_norm_var": 0.010139703750610352, "learning_rate": 0.0001, "loss": 1.5563, "loss/crossentropy": 2.5786248445510864, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.21648633480072021, "step": 6645 }, { "epoch": 0.0992392059071667, "grad_norm": 0.55859375, "grad_norm_var": 0.0040628910064697266, "learning_rate": 0.0001, "loss": 1.5605, "loss/crossentropy": 2.6239606142044067, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.16982664167881012, "step": 6646 }, { "epoch": 0.09925413807778168, "grad_norm": 0.328125, "grad_norm_var": 0.0038283665974934897, "learning_rate": 0.0001, "loss": 1.32, "loss/crossentropy": 2.634735345840454, "loss/fcd": 1.15234375, "loss/idx": 10.0, "loss/logits": 0.16769906878471375, "step": 6647 }, { "epoch": 0.09926907024839665, "grad_norm": 0.359375, "grad_norm_var": 0.0038028558095296225, "learning_rate": 0.0001, "loss": 1.6173, "loss/crossentropy": 2.5070748329162598, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.21101312339305878, "step": 6648 }, { "epoch": 0.09928400241901164, "grad_norm": 0.70703125, "grad_norm_var": 0.01070860226949056, "learning_rate": 0.0001, "loss": 1.5364, "loss/crossentropy": 2.5297698974609375, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.18481283634901047, "step": 6649 }, { "epoch": 0.09929893458962662, "grad_norm": 0.376953125, "grad_norm_var": 0.01070860226949056, "learning_rate": 0.0001, "loss": 1.3946, "loss/crossentropy": 2.600661039352417, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.17583466321229935, "step": 6650 }, { "epoch": 0.0993138667602416, "grad_norm": 0.34765625, "grad_norm_var": 0.010624297459920247, "learning_rate": 0.0001, "loss": 1.4242, "loss/crossentropy": 2.6056759357452393, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18594176322221756, "step": 6651 }, { "epoch": 0.09932879893085658, "grad_norm": 0.375, "grad_norm_var": 0.010649553934733073, "learning_rate": 0.0001, "loss": 1.5371, "loss/crossentropy": 2.5618293285369873, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.21676293015480042, "step": 6652 }, { "epoch": 0.09934373110147157, "grad_norm": 0.322265625, "grad_norm_var": 0.010547749201456706, "learning_rate": 0.0001, "loss": 1.3558, "loss/crossentropy": 2.666766047477722, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.18002939224243164, "step": 6653 }, { "epoch": 0.09935866327208655, "grad_norm": 0.33984375, "grad_norm_var": 0.010517485936482747, "learning_rate": 0.0001, "loss": 1.4127, "loss/crossentropy": 2.7423510551452637, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.18221215903759003, "step": 6654 }, { "epoch": 0.09937359544270152, "grad_norm": 0.392578125, "grad_norm_var": 0.01037896474202474, "learning_rate": 0.0001, "loss": 1.5571, "loss/crossentropy": 2.5574686527252197, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.22510840743780136, "step": 6655 }, { "epoch": 0.09938852761331651, "grad_norm": 0.5390625, "grad_norm_var": 0.011130777994791667, "learning_rate": 0.0001, "loss": 1.6631, "loss/crossentropy": 3.3822888135910034, "loss/fcd": 1.484375, "loss/idx": 10.0, "loss/logits": 0.17872074246406555, "step": 6656 }, { "epoch": 0.09940345978393149, "grad_norm": 0.33984375, "grad_norm_var": 0.011404148737589518, "learning_rate": 0.0001, "loss": 1.4507, "loss/crossentropy": 2.6440606117248535, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.18113850057125092, "step": 6657 }, { "epoch": 0.09941839195454648, "grad_norm": 0.373046875, "grad_norm_var": 0.010997502009073894, "learning_rate": 0.0001, "loss": 1.4817, "loss/crossentropy": 2.7422561645507812, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19267138838768005, "step": 6658 }, { "epoch": 0.09943332412516145, "grad_norm": 0.35546875, "grad_norm_var": 0.011037127176920573, "learning_rate": 0.0001, "loss": 1.4269, "loss/crossentropy": 2.481305956840515, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1807616949081421, "step": 6659 }, { "epoch": 0.09944825629577643, "grad_norm": 0.341796875, "grad_norm_var": 0.011253801981608073, "learning_rate": 0.0001, "loss": 1.486, "loss/crossentropy": 2.588400959968567, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.20473205298185349, "step": 6660 }, { "epoch": 0.09946318846639142, "grad_norm": 0.353515625, "grad_norm_var": 0.011403846740722656, "learning_rate": 0.0001, "loss": 1.5735, "loss/crossentropy": 2.2028855681419373, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.20237994194030762, "step": 6661 }, { "epoch": 0.0994781206370064, "grad_norm": 0.396484375, "grad_norm_var": 0.009632094701131185, "learning_rate": 0.0001, "loss": 1.5188, "loss/crossentropy": 2.617186188697815, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.19852041453123093, "step": 6662 }, { "epoch": 0.09949305280762138, "grad_norm": 0.365234375, "grad_norm_var": 0.009409523010253907, "learning_rate": 0.0001, "loss": 1.5163, "loss/crossentropy": 2.7444956302642822, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20383759588003159, "step": 6663 }, { "epoch": 0.09950798497823636, "grad_norm": 0.33203125, "grad_norm_var": 0.009578196207682292, "learning_rate": 0.0001, "loss": 1.4328, "loss/crossentropy": 2.580856204032898, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18276174366474152, "step": 6664 }, { "epoch": 0.09952291714885134, "grad_norm": 0.337890625, "grad_norm_var": 0.002545658747355143, "learning_rate": 0.0001, "loss": 1.5074, "loss/crossentropy": 2.5040429830551147, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.19490352272987366, "step": 6665 }, { "epoch": 0.09953784931946633, "grad_norm": 0.35546875, "grad_norm_var": 0.002548980712890625, "learning_rate": 0.0001, "loss": 1.3827, "loss/crossentropy": 2.657568573951721, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17175836116075516, "step": 6666 }, { "epoch": 0.0995527814900813, "grad_norm": 0.357421875, "grad_norm_var": 0.0025301456451416017, "learning_rate": 0.0001, "loss": 1.3787, "loss/crossentropy": 2.7566733360290527, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.1677861288189888, "step": 6667 }, { "epoch": 0.09956771366069629, "grad_norm": 0.37109375, "grad_norm_var": 0.0025270938873291015, "learning_rate": 0.0001, "loss": 1.4875, "loss/crossentropy": 2.6220909357070923, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.1906268075108528, "step": 6668 }, { "epoch": 0.09958264583131127, "grad_norm": 0.369140625, "grad_norm_var": 0.0023844242095947266, "learning_rate": 0.0001, "loss": 1.4126, "loss/crossentropy": 2.8838599920272827, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.18608655780553818, "step": 6669 }, { "epoch": 0.09959757800192626, "grad_norm": 0.33984375, "grad_norm_var": 0.0023844242095947266, "learning_rate": 0.0001, "loss": 1.5952, "loss/crossentropy": 2.4878355264663696, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.2241414487361908, "step": 6670 }, { "epoch": 0.09961251017254123, "grad_norm": 0.3359375, "grad_norm_var": 0.0024143854777018228, "learning_rate": 0.0001, "loss": 1.4706, "loss/crossentropy": 2.403890371322632, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19716264307498932, "step": 6671 }, { "epoch": 0.09962744234315621, "grad_norm": 0.6640625, "grad_norm_var": 0.006267738342285156, "learning_rate": 0.0001, "loss": 1.6234, "loss/crossentropy": 2.549322724342346, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.22883284091949463, "step": 6672 }, { "epoch": 0.0996423745137712, "grad_norm": 0.322265625, "grad_norm_var": 0.006367731094360352, "learning_rate": 0.0001, "loss": 1.3965, "loss/crossentropy": 2.562558889389038, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.18559996783733368, "step": 6673 }, { "epoch": 0.09965730668438617, "grad_norm": 0.298828125, "grad_norm_var": 0.006713215510050456, "learning_rate": 0.0001, "loss": 1.3713, "loss/crossentropy": 2.564875841140747, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.17601227760314941, "step": 6674 }, { "epoch": 0.09967223885500116, "grad_norm": 0.359375, "grad_norm_var": 0.0067073663075764975, "learning_rate": 0.0001, "loss": 1.5529, "loss/crossentropy": 2.3551655411720276, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.2052803710103035, "step": 6675 }, { "epoch": 0.09968717102561614, "grad_norm": 0.314453125, "grad_norm_var": 0.006852451960245768, "learning_rate": 0.0001, "loss": 1.2856, "loss/crossentropy": 2.5181286334991455, "loss/fcd": 1.1328125, "loss/idx": 10.0, "loss/logits": 0.15273797512054443, "step": 6676 }, { "epoch": 0.09970210319623111, "grad_norm": 0.369140625, "grad_norm_var": 0.006839481989542643, "learning_rate": 0.0001, "loss": 1.4495, "loss/crossentropy": 2.554733633995056, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.18783105164766312, "step": 6677 }, { "epoch": 0.0997170353668461, "grad_norm": 0.376953125, "grad_norm_var": 0.006789255142211914, "learning_rate": 0.0001, "loss": 1.3752, "loss/crossentropy": 2.6205073595046997, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.17212490737438202, "step": 6678 }, { "epoch": 0.09973196753746108, "grad_norm": 0.31640625, "grad_norm_var": 0.006948598225911458, "learning_rate": 0.0001, "loss": 1.4606, "loss/crossentropy": 2.5299417972564697, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.18715308606624603, "step": 6679 }, { "epoch": 0.09974689970807607, "grad_norm": 0.328125, "grad_norm_var": 0.0069660822550455725, "learning_rate": 0.0001, "loss": 1.5628, "loss/crossentropy": 2.4158849716186523, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.22691118717193604, "step": 6680 }, { "epoch": 0.09976183187869105, "grad_norm": 0.33203125, "grad_norm_var": 0.006988255182902018, "learning_rate": 0.0001, "loss": 1.3087, "loss/crossentropy": 2.5375542640686035, "loss/fcd": 1.16015625, "loss/idx": 10.0, "loss/logits": 0.14850298315286636, "step": 6681 }, { "epoch": 0.09977676404930602, "grad_norm": 0.376953125, "grad_norm_var": 0.006995073954264323, "learning_rate": 0.0001, "loss": 1.6165, "loss/crossentropy": 2.5961453914642334, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.24932827055454254, "step": 6682 }, { "epoch": 0.09979169621992101, "grad_norm": 0.361328125, "grad_norm_var": 0.006992340087890625, "learning_rate": 0.0001, "loss": 1.3946, "loss/crossentropy": 2.548556327819824, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.183698408305645, "step": 6683 }, { "epoch": 0.09980662839053599, "grad_norm": 0.333984375, "grad_norm_var": 0.007047001520792643, "learning_rate": 0.0001, "loss": 1.4728, "loss/crossentropy": 2.4736157655715942, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.19150866568088531, "step": 6684 }, { "epoch": 0.09982156056115098, "grad_norm": 0.322265625, "grad_norm_var": 0.007142368952433268, "learning_rate": 0.0001, "loss": 1.4174, "loss/crossentropy": 2.688010811805725, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.19088301807641983, "step": 6685 }, { "epoch": 0.09983649273176595, "grad_norm": 0.34765625, "grad_norm_var": 0.007125711441040039, "learning_rate": 0.0001, "loss": 1.5898, "loss/crossentropy": 2.6090190410614014, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.2031257301568985, "step": 6686 }, { "epoch": 0.09985142490238093, "grad_norm": 0.3359375, "grad_norm_var": 0.007125711441040039, "learning_rate": 0.0001, "loss": 1.5996, "loss/crossentropy": 2.6179206371307373, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.23239941895008087, "step": 6687 }, { "epoch": 0.09986635707299592, "grad_norm": 0.4296875, "grad_norm_var": 0.001056528091430664, "learning_rate": 0.0001, "loss": 1.5858, "loss/crossentropy": 2.69005286693573, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.22252947837114334, "step": 6688 }, { "epoch": 0.0998812892436109, "grad_norm": 0.34375, "grad_norm_var": 0.001019287109375, "learning_rate": 0.0001, "loss": 1.35, "loss/crossentropy": 2.552179455757141, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.1664014458656311, "step": 6689 }, { "epoch": 0.09989622141422588, "grad_norm": 0.474609375, "grad_norm_var": 0.0018289566040039062, "learning_rate": 0.0001, "loss": 1.5888, "loss/crossentropy": 2.7063311338424683, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.2216458022594452, "step": 6690 }, { "epoch": 0.09991115358484086, "grad_norm": 0.33203125, "grad_norm_var": 0.0018694559733072917, "learning_rate": 0.0001, "loss": 1.4358, "loss/crossentropy": 2.4926434755325317, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.19753260910511017, "step": 6691 }, { "epoch": 0.09992608575545585, "grad_norm": 0.337890625, "grad_norm_var": 0.0017740885416666667, "learning_rate": 0.0001, "loss": 1.3995, "loss/crossentropy": 2.6327335834503174, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17295286059379578, "step": 6692 }, { "epoch": 0.09994101792607082, "grad_norm": 0.3515625, "grad_norm_var": 0.0017659346262613933, "learning_rate": 0.0001, "loss": 1.6324, "loss/crossentropy": 2.5619524717330933, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.26518647372722626, "step": 6693 }, { "epoch": 0.0999559500966858, "grad_norm": 0.388671875, "grad_norm_var": 0.0018067518870035807, "learning_rate": 0.0001, "loss": 1.5474, "loss/crossentropy": 2.3042773008346558, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.18805165588855743, "step": 6694 }, { "epoch": 0.09997088226730079, "grad_norm": 0.435546875, "grad_norm_var": 0.0020481745402018228, "learning_rate": 0.0001, "loss": 1.7993, "loss/crossentropy": 2.205067276954651, "loss/fcd": 1.5703125, "loss/idx": 10.0, "loss/logits": 0.22896741330623627, "step": 6695 }, { "epoch": 0.09998581443791577, "grad_norm": 0.369140625, "grad_norm_var": 0.001954380671183268, "learning_rate": 0.0001, "loss": 1.6659, "loss/crossentropy": 2.801188826560974, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.23235543817281723, "step": 6696 }, { "epoch": 0.10000074660853076, "grad_norm": 0.302734375, "grad_norm_var": 0.0021448771158854166, "learning_rate": 0.0001, "loss": 1.4521, "loss/crossentropy": 2.6775656938552856, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19430534541606903, "step": 6697 }, { "epoch": 0.10001567877914573, "grad_norm": 0.3515625, "grad_norm_var": 0.0021454970041910808, "learning_rate": 0.0001, "loss": 1.523, "loss/crossentropy": 2.357596516609192, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.21439485251903534, "step": 6698 }, { "epoch": 0.1000306109497607, "grad_norm": 0.32421875, "grad_norm_var": 0.0022430419921875, "learning_rate": 0.0001, "loss": 1.522, "loss/crossentropy": 2.438189148902893, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20165546238422394, "step": 6699 }, { "epoch": 0.1000455431203757, "grad_norm": 0.380859375, "grad_norm_var": 0.00220947265625, "learning_rate": 0.0001, "loss": 1.4919, "loss/crossentropy": 2.5182743072509766, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.1872374638915062, "step": 6700 }, { "epoch": 0.10006047529099067, "grad_norm": 0.3671875, "grad_norm_var": 0.002084080378214518, "learning_rate": 0.0001, "loss": 1.4698, "loss/crossentropy": 2.7283897399902344, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.1963687464594841, "step": 6701 }, { "epoch": 0.10007540746160566, "grad_norm": 0.34765625, "grad_norm_var": 0.002084080378214518, "learning_rate": 0.0001, "loss": 1.2967, "loss/crossentropy": 2.6235320568084717, "loss/fcd": 1.13671875, "loss/idx": 10.0, "loss/logits": 0.1599772721529007, "step": 6702 }, { "epoch": 0.10009033963222064, "grad_norm": 0.390625, "grad_norm_var": 0.0020440260569254557, "learning_rate": 0.0001, "loss": 1.5881, "loss/crossentropy": 2.871315598487854, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.23262178897857666, "step": 6703 }, { "epoch": 0.10010527180283561, "grad_norm": 0.380859375, "grad_norm_var": 0.0018075942993164063, "learning_rate": 0.0001, "loss": 1.594, "loss/crossentropy": 2.64028537273407, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.23460787534713745, "step": 6704 }, { "epoch": 0.1001202039734506, "grad_norm": 0.3359375, "grad_norm_var": 0.001836077372233073, "learning_rate": 0.0001, "loss": 1.412, "loss/crossentropy": 2.644906997680664, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.18545383214950562, "step": 6705 }, { "epoch": 0.10013513614406558, "grad_norm": 0.369140625, "grad_norm_var": 0.0010172526041666667, "learning_rate": 0.0001, "loss": 1.6528, "loss/crossentropy": 2.5958248376846313, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.23096872866153717, "step": 6706 }, { "epoch": 0.10015006831468057, "grad_norm": 0.353515625, "grad_norm_var": 0.0009649753570556641, "learning_rate": 0.0001, "loss": 1.4677, "loss/crossentropy": 2.674380660057068, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.18648358434438705, "step": 6707 }, { "epoch": 0.10016500048529554, "grad_norm": 0.330078125, "grad_norm_var": 0.0009935855865478515, "learning_rate": 0.0001, "loss": 1.3587, "loss/crossentropy": 2.5934263467788696, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.16734110563993454, "step": 6708 }, { "epoch": 0.10017993265591052, "grad_norm": 0.380859375, "grad_norm_var": 0.0010095596313476562, "learning_rate": 0.0001, "loss": 1.678, "loss/crossentropy": 2.717575192451477, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.24047063291072845, "step": 6709 }, { "epoch": 0.10019486482652551, "grad_norm": 0.3984375, "grad_norm_var": 0.001048898696899414, "learning_rate": 0.0001, "loss": 1.741, "loss/crossentropy": 2.713590383529663, "loss/fcd": 1.4453125, "loss/idx": 10.0, "loss/logits": 0.29573559761047363, "step": 6710 }, { "epoch": 0.10020979699714049, "grad_norm": 0.333984375, "grad_norm_var": 0.0007199446360270182, "learning_rate": 0.0001, "loss": 1.3654, "loss/crossentropy": 2.7102891206741333, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.17011871188879013, "step": 6711 }, { "epoch": 0.10022472916775547, "grad_norm": 0.384765625, "grad_norm_var": 0.0007598718007405599, "learning_rate": 0.0001, "loss": 1.5125, "loss/crossentropy": 2.501170039176941, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.18434269726276398, "step": 6712 }, { "epoch": 0.10023966133837045, "grad_norm": 0.392578125, "grad_norm_var": 0.0005990187327067057, "learning_rate": 0.0001, "loss": 1.5658, "loss/crossentropy": 2.7239317893981934, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.21814817935228348, "step": 6713 }, { "epoch": 0.10025459350898544, "grad_norm": 0.330078125, "grad_norm_var": 0.0006631851196289062, "learning_rate": 0.0001, "loss": 1.4706, "loss/crossentropy": 2.578370451927185, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19718600064516068, "step": 6714 }, { "epoch": 0.10026952567960042, "grad_norm": 0.35546875, "grad_norm_var": 0.0005645116170247395, "learning_rate": 0.0001, "loss": 1.5249, "loss/crossentropy": 2.5735079050064087, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.20852269977331161, "step": 6715 }, { "epoch": 0.10028445785021539, "grad_norm": 0.337890625, "grad_norm_var": 0.0005861918131510416, "learning_rate": 0.0001, "loss": 1.4788, "loss/crossentropy": 2.526458501815796, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.18191725760698318, "step": 6716 }, { "epoch": 0.10029939002083038, "grad_norm": 0.36328125, "grad_norm_var": 0.0005843480428059896, "learning_rate": 0.0001, "loss": 1.5353, "loss/crossentropy": 2.506135106086731, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.21495287865400314, "step": 6717 }, { "epoch": 0.10031432219144536, "grad_norm": 0.32421875, "grad_norm_var": 0.0006621678670247396, "learning_rate": 0.0001, "loss": 1.4377, "loss/crossentropy": 2.4315789937973022, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.17985863983631134, "step": 6718 }, { "epoch": 0.10032925436206035, "grad_norm": 0.34765625, "grad_norm_var": 0.00060272216796875, "learning_rate": 0.0001, "loss": 1.3885, "loss/crossentropy": 2.753472089767456, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.1736505925655365, "step": 6719 }, { "epoch": 0.10034418653267532, "grad_norm": 0.341796875, "grad_norm_var": 0.000576019287109375, "learning_rate": 0.0001, "loss": 1.4179, "loss/crossentropy": 2.627281904220581, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.18747249245643616, "step": 6720 }, { "epoch": 0.1003591187032903, "grad_norm": 0.392578125, "grad_norm_var": 0.0006327152252197266, "learning_rate": 0.0001, "loss": 1.5641, "loss/crossentropy": 2.652393937110901, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.22421159595251083, "step": 6721 }, { "epoch": 0.10037405087390529, "grad_norm": 0.392578125, "grad_norm_var": 0.0007002353668212891, "learning_rate": 0.0001, "loss": 1.5204, "loss/crossentropy": 2.553027629852295, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.19621462374925613, "step": 6722 }, { "epoch": 0.10038898304452026, "grad_norm": 0.3203125, "grad_norm_var": 0.0007977803548177083, "learning_rate": 0.0001, "loss": 1.4617, "loss/crossentropy": 2.66091251373291, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19607459008693695, "step": 6723 }, { "epoch": 0.10040391521513525, "grad_norm": 0.384765625, "grad_norm_var": 0.0007817586263020833, "learning_rate": 0.0001, "loss": 1.5224, "loss/crossentropy": 2.5136630535125732, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.19818077981472015, "step": 6724 }, { "epoch": 0.10041884738575023, "grad_norm": 0.35546875, "grad_norm_var": 0.000755929946899414, "learning_rate": 0.0001, "loss": 1.6248, "loss/crossentropy": 2.5247164964675903, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.21854159235954285, "step": 6725 }, { "epoch": 0.1004337795563652, "grad_norm": 0.365234375, "grad_norm_var": 0.0006535212198893229, "learning_rate": 0.0001, "loss": 1.5151, "loss/crossentropy": 2.5325881242752075, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.21036796271800995, "step": 6726 }, { "epoch": 0.1004487117269802, "grad_norm": 0.326171875, "grad_norm_var": 0.0006820042928059896, "learning_rate": 0.0001, "loss": 1.4238, "loss/crossentropy": 2.267000436782837, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.16986537724733353, "step": 6727 }, { "epoch": 0.10046364389759517, "grad_norm": 0.3203125, "grad_norm_var": 0.0007045586903889974, "learning_rate": 0.0001, "loss": 1.4313, "loss/crossentropy": 2.711987614631653, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18912090361118317, "step": 6728 }, { "epoch": 0.10047857606821016, "grad_norm": 0.349609375, "grad_norm_var": 0.0005940596262613932, "learning_rate": 0.0001, "loss": 1.5259, "loss/crossentropy": 2.704915165901184, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.23290076106786728, "step": 6729 }, { "epoch": 0.10049350823882514, "grad_norm": 0.2890625, "grad_norm_var": 0.000810686747233073, "learning_rate": 0.0001, "loss": 1.2811, "loss/crossentropy": 2.44112229347229, "loss/fcd": 1.1328125, "loss/idx": 10.0, "loss/logits": 0.14825539290905, "step": 6730 }, { "epoch": 0.10050844040944013, "grad_norm": 0.38671875, "grad_norm_var": 0.0009032567342122396, "learning_rate": 0.0001, "loss": 1.499, "loss/crossentropy": 2.5905803442001343, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.19822534173727036, "step": 6731 }, { "epoch": 0.1005233725800551, "grad_norm": 0.345703125, "grad_norm_var": 0.000894610087076823, "learning_rate": 0.0001, "loss": 1.571, "loss/crossentropy": 2.6993919610977173, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.22721099108457565, "step": 6732 }, { "epoch": 0.10053830475067008, "grad_norm": 0.330078125, "grad_norm_var": 0.0009062290191650391, "learning_rate": 0.0001, "loss": 1.4848, "loss/crossentropy": 2.729207396507263, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.19184017926454544, "step": 6733 }, { "epoch": 0.10055323692128507, "grad_norm": 0.3203125, "grad_norm_var": 0.0009197076161702474, "learning_rate": 0.0001, "loss": 1.3438, "loss/crossentropy": 2.439937472343445, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.15633275359869003, "step": 6734 }, { "epoch": 0.10056816909190004, "grad_norm": 0.384765625, "grad_norm_var": 0.0010039647420247396, "learning_rate": 0.0001, "loss": 1.4776, "loss/crossentropy": 2.644134521484375, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.19243793934583664, "step": 6735 }, { "epoch": 0.10058310126251503, "grad_norm": 0.392578125, "grad_norm_var": 0.0011072794596354166, "learning_rate": 0.0001, "loss": 1.5574, "loss/crossentropy": 2.5311696529388428, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.20972707122564316, "step": 6736 }, { "epoch": 0.10059803343313001, "grad_norm": 0.37890625, "grad_norm_var": 0.0010477542877197266, "learning_rate": 0.0001, "loss": 1.5761, "loss/crossentropy": 2.570286989212036, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.22058765590190887, "step": 6737 }, { "epoch": 0.10061296560374498, "grad_norm": 0.404296875, "grad_norm_var": 0.0011187076568603515, "learning_rate": 0.0001, "loss": 1.8494, "loss/crossentropy": 2.393090009689331, "loss/fcd": 1.55859375, "loss/idx": 10.0, "loss/logits": 0.29082587361335754, "step": 6738 }, { "epoch": 0.10062789777435997, "grad_norm": 0.341796875, "grad_norm_var": 0.0010527928670247396, "learning_rate": 0.0001, "loss": 1.5494, "loss/crossentropy": 2.5799375772476196, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.20567850023508072, "step": 6739 }, { "epoch": 0.10064282994497495, "grad_norm": 0.4140625, "grad_norm_var": 0.00122373898824056, "learning_rate": 0.0001, "loss": 1.7677, "loss/crossentropy": 2.4009227752685547, "loss/fcd": 1.515625, "loss/idx": 10.0, "loss/logits": 0.2520891800522804, "step": 6740 }, { "epoch": 0.10065776211558994, "grad_norm": 0.3203125, "grad_norm_var": 0.0013061364491780598, "learning_rate": 0.0001, "loss": 1.4143, "loss/crossentropy": 2.616269588470459, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.1916727051138878, "step": 6741 }, { "epoch": 0.10067269428620491, "grad_norm": 0.37890625, "grad_norm_var": 0.0013376235961914062, "learning_rate": 0.0001, "loss": 1.4192, "loss/crossentropy": 2.7918858528137207, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1887017786502838, "step": 6742 }, { "epoch": 0.10068762645681989, "grad_norm": 0.3671875, "grad_norm_var": 0.001283884048461914, "learning_rate": 0.0001, "loss": 1.5269, "loss/crossentropy": 2.8992546796798706, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.21833428740501404, "step": 6743 }, { "epoch": 0.10070255862743488, "grad_norm": 0.412109375, "grad_norm_var": 0.0013518651326497396, "learning_rate": 0.0001, "loss": 1.6325, "loss/crossentropy": 2.396876335144043, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.222324937582016, "step": 6744 }, { "epoch": 0.10071749079804986, "grad_norm": 0.333984375, "grad_norm_var": 0.0013961156209309896, "learning_rate": 0.0001, "loss": 1.3421, "loss/crossentropy": 2.8760725259780884, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.1663421094417572, "step": 6745 }, { "epoch": 0.10073242296866484, "grad_norm": 0.365234375, "grad_norm_var": 0.0010124047597249348, "learning_rate": 0.0001, "loss": 1.4098, "loss/crossentropy": 2.5872840881347656, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1636582911014557, "step": 6746 }, { "epoch": 0.10074735513927982, "grad_norm": 0.396484375, "grad_norm_var": 0.0010436375935872395, "learning_rate": 0.0001, "loss": 1.7543, "loss/crossentropy": 2.275310754776001, "loss/fcd": 1.484375, "loss/idx": 10.0, "loss/logits": 0.2699260711669922, "step": 6747 }, { "epoch": 0.1007622873098948, "grad_norm": 0.36328125, "grad_norm_var": 0.001010878880818685, "learning_rate": 0.0001, "loss": 1.5451, "loss/crossentropy": 2.5436272621154785, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.2169921100139618, "step": 6748 }, { "epoch": 0.10077721948050979, "grad_norm": 0.39453125, "grad_norm_var": 0.0009358723958333334, "learning_rate": 0.0001, "loss": 1.5988, "loss/crossentropy": 2.7244755029678345, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.23556188493967056, "step": 6749 }, { "epoch": 0.10079215165112476, "grad_norm": 0.353515625, "grad_norm_var": 0.0007713158925374349, "learning_rate": 0.0001, "loss": 1.3962, "loss/crossentropy": 2.6445837020874023, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.1813090518116951, "step": 6750 }, { "epoch": 0.10080708382173975, "grad_norm": 0.37109375, "grad_norm_var": 0.0007654190063476563, "learning_rate": 0.0001, "loss": 1.5689, "loss/crossentropy": 2.497426748275757, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.21733355522155762, "step": 6751 }, { "epoch": 0.10082201599235473, "grad_norm": 0.357421875, "grad_norm_var": 0.0007568359375, "learning_rate": 0.0001, "loss": 1.4378, "loss/crossentropy": 2.8580245971679688, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.19954662770032883, "step": 6752 }, { "epoch": 0.10083694816296972, "grad_norm": 0.33203125, "grad_norm_var": 0.0008514404296875, "learning_rate": 0.0001, "loss": 1.5589, "loss/crossentropy": 2.5854259729385376, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.21512188017368317, "step": 6753 }, { "epoch": 0.10085188033358469, "grad_norm": 0.38671875, "grad_norm_var": 0.0007883548736572265, "learning_rate": 0.0001, "loss": 1.528, "loss/crossentropy": 2.4200336933135986, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.19600047171115875, "step": 6754 }, { "epoch": 0.10086681250419967, "grad_norm": 0.40234375, "grad_norm_var": 0.0008056004842122396, "learning_rate": 0.0001, "loss": 1.5722, "loss/crossentropy": 2.520390033721924, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.2401481196284294, "step": 6755 }, { "epoch": 0.10088174467481466, "grad_norm": 0.322265625, "grad_norm_var": 0.0008153120676676432, "learning_rate": 0.0001, "loss": 1.4065, "loss/crossentropy": 2.7085646390914917, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.17605309188365936, "step": 6756 }, { "epoch": 0.10089667684542963, "grad_norm": 0.33203125, "grad_norm_var": 0.0007523695627848307, "learning_rate": 0.0001, "loss": 1.5741, "loss/crossentropy": 2.7387603521347046, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.2186017483472824, "step": 6757 }, { "epoch": 0.10091160901604462, "grad_norm": 0.40625, "grad_norm_var": 0.0008431593577067057, "learning_rate": 0.0001, "loss": 1.5454, "loss/crossentropy": 2.3879019021987915, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20942757278680801, "step": 6758 }, { "epoch": 0.1009265411866596, "grad_norm": 0.349609375, "grad_norm_var": 0.0008656183878580729, "learning_rate": 0.0001, "loss": 1.5183, "loss/crossentropy": 2.52150821685791, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20580044388771057, "step": 6759 }, { "epoch": 0.10094147335727457, "grad_norm": 0.421875, "grad_norm_var": 0.0009297529856363932, "learning_rate": 0.0001, "loss": 1.4588, "loss/crossentropy": 2.812743663787842, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.18536171317100525, "step": 6760 }, { "epoch": 0.10095640552788956, "grad_norm": 0.359375, "grad_norm_var": 0.0008547465006510417, "learning_rate": 0.0001, "loss": 1.5318, "loss/crossentropy": 2.5908385515213013, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.21535330265760422, "step": 6761 }, { "epoch": 0.10097133769850454, "grad_norm": 0.35546875, "grad_norm_var": 0.0008664290110270182, "learning_rate": 0.0001, "loss": 1.5684, "loss/crossentropy": 2.497899055480957, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.22076790034770966, "step": 6762 }, { "epoch": 0.10098626986911953, "grad_norm": 0.376953125, "grad_norm_var": 0.0008187452952067057, "learning_rate": 0.0001, "loss": 1.568, "loss/crossentropy": 2.614689826965332, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.21645408123731613, "step": 6763 }, { "epoch": 0.1010012020397345, "grad_norm": 0.31640625, "grad_norm_var": 0.0009843031565348306, "learning_rate": 0.0001, "loss": 1.4948, "loss/crossentropy": 2.60906982421875, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.20178808271884918, "step": 6764 }, { "epoch": 0.10101613421034948, "grad_norm": 0.361328125, "grad_norm_var": 0.0009218851725260416, "learning_rate": 0.0001, "loss": 1.485, "loss/crossentropy": 2.6298907995224, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.2037401795387268, "step": 6765 }, { "epoch": 0.10103106638096447, "grad_norm": 0.330078125, "grad_norm_var": 0.0009852091471354166, "learning_rate": 0.0001, "loss": 1.4377, "loss/crossentropy": 2.6160773038864136, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18767016381025314, "step": 6766 }, { "epoch": 0.10104599855157945, "grad_norm": 0.390625, "grad_norm_var": 0.0010344823201497396, "learning_rate": 0.0001, "loss": 1.4309, "loss/crossentropy": 2.5154846906661987, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.17697512358427048, "step": 6767 }, { "epoch": 0.10106093072219444, "grad_norm": 0.5625, "grad_norm_var": 0.003522857030232747, "learning_rate": 0.0001, "loss": 1.7145, "loss/crossentropy": 2.4125803112983704, "loss/fcd": 1.49609375, "loss/idx": 10.0, "loss/logits": 0.2183752804994583, "step": 6768 }, { "epoch": 0.10107586289280941, "grad_norm": 0.99609375, "grad_norm_var": 0.027247095108032228, "learning_rate": 0.0001, "loss": 2.0501, "loss/crossentropy": 2.833680748939514, "loss/fcd": 1.70703125, "loss/idx": 10.0, "loss/logits": 0.34309619665145874, "step": 6769 }, { "epoch": 0.10109079506342439, "grad_norm": 0.341796875, "grad_norm_var": 0.027553812662760416, "learning_rate": 0.0001, "loss": 1.308, "loss/crossentropy": 2.611355185508728, "loss/fcd": 1.1484375, "loss/idx": 10.0, "loss/logits": 0.15959452837705612, "step": 6770 }, { "epoch": 0.10110572723403938, "grad_norm": 0.37890625, "grad_norm_var": 0.027624766031901043, "learning_rate": 0.0001, "loss": 1.6613, "loss/crossentropy": 2.3786885738372803, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.23946832120418549, "step": 6771 }, { "epoch": 0.10112065940465435, "grad_norm": 0.3828125, "grad_norm_var": 0.027124643325805664, "learning_rate": 0.0001, "loss": 1.5486, "loss/crossentropy": 2.347312331199646, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.20873858779668808, "step": 6772 }, { "epoch": 0.10113559157526934, "grad_norm": 0.326171875, "grad_norm_var": 0.02719268798828125, "learning_rate": 0.0001, "loss": 1.569, "loss/crossentropy": 2.7425462007522583, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.2057669758796692, "step": 6773 }, { "epoch": 0.10115052374588432, "grad_norm": 0.37109375, "grad_norm_var": 0.027315711975097655, "learning_rate": 0.0001, "loss": 1.4815, "loss/crossentropy": 2.384253740310669, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.18464846909046173, "step": 6774 }, { "epoch": 0.10116545591649931, "grad_norm": 0.328125, "grad_norm_var": 0.02752849260965983, "learning_rate": 0.0001, "loss": 1.4162, "loss/crossentropy": 2.6039944887161255, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.17786931991577148, "step": 6775 }, { "epoch": 0.10118038808711428, "grad_norm": 0.337890625, "grad_norm_var": 0.02786407470703125, "learning_rate": 0.0001, "loss": 1.4054, "loss/crossentropy": 2.4716798067092896, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17884016782045364, "step": 6776 }, { "epoch": 0.10119532025772926, "grad_norm": 0.36328125, "grad_norm_var": 0.027840105692545573, "learning_rate": 0.0001, "loss": 1.4583, "loss/crossentropy": 2.6464744806289673, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.2044277787208557, "step": 6777 }, { "epoch": 0.10121025242834425, "grad_norm": 0.546875, "grad_norm_var": 0.02880274454752604, "learning_rate": 0.0001, "loss": 1.454, "loss/crossentropy": 2.647442102432251, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.18448526412248611, "step": 6778 }, { "epoch": 0.10122518459895923, "grad_norm": 0.390625, "grad_norm_var": 0.028736988703409832, "learning_rate": 0.0001, "loss": 1.4363, "loss/crossentropy": 2.570061683654785, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18241893500089645, "step": 6779 }, { "epoch": 0.10124011676957421, "grad_norm": 0.330078125, "grad_norm_var": 0.028559303283691405, "learning_rate": 0.0001, "loss": 1.5096, "loss/crossentropy": 2.4383902549743652, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.21666359156370163, "step": 6780 }, { "epoch": 0.10125504894018919, "grad_norm": 0.34765625, "grad_norm_var": 0.0286800225575765, "learning_rate": 0.0001, "loss": 1.5245, "loss/crossentropy": 2.5247777700424194, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.204192653298378, "step": 6781 }, { "epoch": 0.10126998111080417, "grad_norm": 0.35546875, "grad_norm_var": 0.0284149169921875, "learning_rate": 0.0001, "loss": 1.4723, "loss/crossentropy": 2.6534340381622314, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.20278331637382507, "step": 6782 }, { "epoch": 0.10128491328141916, "grad_norm": 0.353515625, "grad_norm_var": 0.028655608495076496, "learning_rate": 0.0001, "loss": 1.4833, "loss/crossentropy": 2.773690938949585, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19421299546957016, "step": 6783 }, { "epoch": 0.10129984545203413, "grad_norm": 0.33984375, "grad_norm_var": 0.027510436375935872, "learning_rate": 0.0001, "loss": 1.3734, "loss/crossentropy": 2.4825457334518433, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.16637281328439713, "step": 6784 }, { "epoch": 0.10131477762264912, "grad_norm": 0.388671875, "grad_norm_var": 0.0027498881022135417, "learning_rate": 0.0001, "loss": 1.6167, "loss/crossentropy": 2.409664750099182, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.20657654106616974, "step": 6785 }, { "epoch": 0.1013297097932641, "grad_norm": 0.34375, "grad_norm_var": 0.002743387222290039, "learning_rate": 0.0001, "loss": 1.4654, "loss/crossentropy": 2.5184710025787354, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.2036839723587036, "step": 6786 }, { "epoch": 0.10134464196387907, "grad_norm": 0.408203125, "grad_norm_var": 0.002840423583984375, "learning_rate": 0.0001, "loss": 1.6389, "loss/crossentropy": 2.6096699237823486, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.2326015681028366, "step": 6787 }, { "epoch": 0.10135957413449406, "grad_norm": 0.35546875, "grad_norm_var": 0.0028390884399414062, "learning_rate": 0.0001, "loss": 1.4851, "loss/crossentropy": 2.678963541984558, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.1960090920329094, "step": 6788 }, { "epoch": 0.10137450630510904, "grad_norm": 0.3359375, "grad_norm_var": 0.002790689468383789, "learning_rate": 0.0001, "loss": 1.4834, "loss/crossentropy": 2.279628098011017, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.1825946718454361, "step": 6789 }, { "epoch": 0.10138943847572403, "grad_norm": 0.376953125, "grad_norm_var": 0.002794837951660156, "learning_rate": 0.0001, "loss": 1.3651, "loss/crossentropy": 2.6578279733657837, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.1620129495859146, "step": 6790 }, { "epoch": 0.101404370646339, "grad_norm": 0.388671875, "grad_norm_var": 0.0026948134104410808, "learning_rate": 0.0001, "loss": 1.5593, "loss/crossentropy": 2.54449462890625, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.2077426016330719, "step": 6791 }, { "epoch": 0.101419302816954, "grad_norm": 0.365234375, "grad_norm_var": 0.0026147047678629558, "learning_rate": 0.0001, "loss": 1.519, "loss/crossentropy": 2.6040111780166626, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20645006000995636, "step": 6792 }, { "epoch": 0.10143423498756897, "grad_norm": 0.34375, "grad_norm_var": 0.0026674747467041017, "learning_rate": 0.0001, "loss": 1.3969, "loss/crossentropy": 2.757929801940918, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17424525320529938, "step": 6793 }, { "epoch": 0.10144916715818394, "grad_norm": 0.404296875, "grad_norm_var": 0.0006357828776041666, "learning_rate": 0.0001, "loss": 1.5333, "loss/crossentropy": 2.62608540058136, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.18563763797283173, "step": 6794 }, { "epoch": 0.10146409932879893, "grad_norm": 0.357421875, "grad_norm_var": 0.0005879561106363932, "learning_rate": 0.0001, "loss": 1.4804, "loss/crossentropy": 2.5331084728240967, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.1913183405995369, "step": 6795 }, { "epoch": 0.10147903149941391, "grad_norm": 0.37109375, "grad_norm_var": 0.0005175272623697916, "learning_rate": 0.0001, "loss": 1.486, "loss/crossentropy": 2.7663108110427856, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19688871502876282, "step": 6796 }, { "epoch": 0.1014939636700289, "grad_norm": 0.306640625, "grad_norm_var": 0.0007161299387613932, "learning_rate": 0.0001, "loss": 1.3719, "loss/crossentropy": 2.4730652570724487, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.1687570884823799, "step": 6797 }, { "epoch": 0.10150889584064388, "grad_norm": 0.369140625, "grad_norm_var": 0.0007155736287434896, "learning_rate": 0.0001, "loss": 1.3802, "loss/crossentropy": 2.604902505874634, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.16534312069416046, "step": 6798 }, { "epoch": 0.10152382801125885, "grad_norm": 0.37109375, "grad_norm_var": 0.0007125695546468098, "learning_rate": 0.0001, "loss": 1.5412, "loss/crossentropy": 2.561508893966675, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.21308697760105133, "step": 6799 }, { "epoch": 0.10153876018187384, "grad_norm": 0.330078125, "grad_norm_var": 0.0007501602172851563, "learning_rate": 0.0001, "loss": 1.4155, "loss/crossentropy": 2.7255066633224487, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1850440800189972, "step": 6800 }, { "epoch": 0.10155369235248882, "grad_norm": 0.318359375, "grad_norm_var": 0.0008234024047851562, "learning_rate": 0.0001, "loss": 1.3698, "loss/crossentropy": 2.746517062187195, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.1744898334145546, "step": 6801 }, { "epoch": 0.1015686245231038, "grad_norm": 0.326171875, "grad_norm_var": 0.0008787631988525391, "learning_rate": 0.0001, "loss": 1.4088, "loss/crossentropy": 2.6443780660629272, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.18219058215618134, "step": 6802 }, { "epoch": 0.10158355669371878, "grad_norm": 0.361328125, "grad_norm_var": 0.000702524185180664, "learning_rate": 0.0001, "loss": 1.5005, "loss/crossentropy": 2.5193777084350586, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.187999926507473, "step": 6803 }, { "epoch": 0.10159848886433376, "grad_norm": 0.353515625, "grad_norm_var": 0.000702667236328125, "learning_rate": 0.0001, "loss": 1.4469, "loss/crossentropy": 2.8317724466323853, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.1929592788219452, "step": 6804 }, { "epoch": 0.10161342103494875, "grad_norm": 0.365234375, "grad_norm_var": 0.000681924819946289, "learning_rate": 0.0001, "loss": 1.6596, "loss/crossentropy": 2.359486699104309, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.2377251386642456, "step": 6805 }, { "epoch": 0.10162835320556372, "grad_norm": 0.33984375, "grad_norm_var": 0.0006683349609375, "learning_rate": 0.0001, "loss": 1.4543, "loss/crossentropy": 2.6328084468841553, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19648471474647522, "step": 6806 }, { "epoch": 0.10164328537617871, "grad_norm": 0.33203125, "grad_norm_var": 0.0006107171376546224, "learning_rate": 0.0001, "loss": 1.3719, "loss/crossentropy": 2.700552821159363, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.17661084979772568, "step": 6807 }, { "epoch": 0.10165821754679369, "grad_norm": 0.421875, "grad_norm_var": 0.0009190877278645833, "learning_rate": 0.0001, "loss": 1.7138, "loss/crossentropy": 2.8038655519485474, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.25678397715091705, "step": 6808 }, { "epoch": 0.10167314971740866, "grad_norm": 0.35546875, "grad_norm_var": 0.0009108861287434896, "learning_rate": 0.0001, "loss": 1.4978, "loss/crossentropy": 2.790649652481079, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.2047848328948021, "step": 6809 }, { "epoch": 0.10168808188802365, "grad_norm": 0.376953125, "grad_norm_var": 0.0007787068684895833, "learning_rate": 0.0001, "loss": 1.5022, "loss/crossentropy": 2.6088619232177734, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.2092486172914505, "step": 6810 }, { "epoch": 0.10170301405863863, "grad_norm": 0.341796875, "grad_norm_var": 0.00078582763671875, "learning_rate": 0.0001, "loss": 1.4691, "loss/crossentropy": 2.611506223678589, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19566602259874344, "step": 6811 }, { "epoch": 0.10171794622925362, "grad_norm": 0.359375, "grad_norm_var": 0.0007654190063476563, "learning_rate": 0.0001, "loss": 1.5339, "loss/crossentropy": 2.5336912870407104, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.20970190316438675, "step": 6812 }, { "epoch": 0.1017328783998686, "grad_norm": 0.345703125, "grad_norm_var": 0.0006255467732747395, "learning_rate": 0.0001, "loss": 1.5258, "loss/crossentropy": 2.540697455406189, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.19767163693904877, "step": 6813 }, { "epoch": 0.10174781057048359, "grad_norm": 0.37109375, "grad_norm_var": 0.0006296634674072266, "learning_rate": 0.0001, "loss": 1.3625, "loss/crossentropy": 2.702876091003418, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.17886711657047272, "step": 6814 }, { "epoch": 0.10176274274109856, "grad_norm": 0.34765625, "grad_norm_var": 0.0006117343902587891, "learning_rate": 0.0001, "loss": 1.4139, "loss/crossentropy": 2.650570511817932, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.18729235231876373, "step": 6815 }, { "epoch": 0.10177767491171354, "grad_norm": 0.349609375, "grad_norm_var": 0.0005761305491129557, "learning_rate": 0.0001, "loss": 1.5579, "loss/crossentropy": 2.555426597595215, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.20242533832788467, "step": 6816 }, { "epoch": 0.10179260708232853, "grad_norm": 0.447265625, "grad_norm_var": 0.0009999434153238932, "learning_rate": 0.0001, "loss": 1.7209, "loss/crossentropy": 2.6158447265625, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.28341422975063324, "step": 6817 }, { "epoch": 0.1018075392529435, "grad_norm": 0.345703125, "grad_norm_var": 0.0009300072987874348, "learning_rate": 0.0001, "loss": 1.4639, "loss/crossentropy": 2.54550039768219, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19042949378490448, "step": 6818 }, { "epoch": 0.10182247142355849, "grad_norm": 0.2890625, "grad_norm_var": 0.001276397705078125, "learning_rate": 0.0001, "loss": 1.2814, "loss/crossentropy": 2.622506618499756, "loss/fcd": 1.13671875, "loss/idx": 10.0, "loss/logits": 0.14464574307203293, "step": 6819 }, { "epoch": 0.10183740359417347, "grad_norm": 0.333984375, "grad_norm_var": 0.0013142267862955729, "learning_rate": 0.0001, "loss": 1.3562, "loss/crossentropy": 2.676241874694824, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.1804373636841774, "step": 6820 }, { "epoch": 0.10185233576478844, "grad_norm": 0.400390625, "grad_norm_var": 0.0014269510904947917, "learning_rate": 0.0001, "loss": 1.424, "loss/crossentropy": 2.710317373275757, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18569138646125793, "step": 6821 }, { "epoch": 0.10186726793540343, "grad_norm": 0.349609375, "grad_norm_var": 0.00140684445699056, "learning_rate": 0.0001, "loss": 1.3948, "loss/crossentropy": 2.646668791770935, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.1799672394990921, "step": 6822 }, { "epoch": 0.10188220010601841, "grad_norm": 0.328125, "grad_norm_var": 0.0014226118723551433, "learning_rate": 0.0001, "loss": 1.6633, "loss/crossentropy": 2.3544886112213135, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.2687440514564514, "step": 6823 }, { "epoch": 0.1018971322766334, "grad_norm": 0.3359375, "grad_norm_var": 0.001177835464477539, "learning_rate": 0.0001, "loss": 1.4558, "loss/crossentropy": 2.5608922243118286, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19799897819757462, "step": 6824 }, { "epoch": 0.10191206444724837, "grad_norm": 0.40234375, "grad_norm_var": 0.001318979263305664, "learning_rate": 0.0001, "loss": 1.4548, "loss/crossentropy": 2.7037196159362793, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.19305582344532013, "step": 6825 }, { "epoch": 0.10192699661786335, "grad_norm": 0.345703125, "grad_norm_var": 0.0013001600901285807, "learning_rate": 0.0001, "loss": 1.5276, "loss/crossentropy": 2.5862733125686646, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.21117711067199707, "step": 6826 }, { "epoch": 0.10194192878847834, "grad_norm": 0.41015625, "grad_norm_var": 0.0014642715454101563, "learning_rate": 0.0001, "loss": 1.9993, "loss/crossentropy": 2.643382430076599, "loss/fcd": 1.68359375, "loss/idx": 10.0, "loss/logits": 0.31572768837213516, "step": 6827 }, { "epoch": 0.10195686095909332, "grad_norm": 0.365234375, "grad_norm_var": 0.0014658451080322265, "learning_rate": 0.0001, "loss": 1.5774, "loss/crossentropy": 2.5406917333602905, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.20239388197660446, "step": 6828 }, { "epoch": 0.1019717931297083, "grad_norm": 0.33984375, "grad_norm_var": 0.0014795303344726563, "learning_rate": 0.0001, "loss": 1.5668, "loss/crossentropy": 2.558605432510376, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.21128331124782562, "step": 6829 }, { "epoch": 0.10198672530032328, "grad_norm": 0.349609375, "grad_norm_var": 0.0014769077301025391, "learning_rate": 0.0001, "loss": 1.5785, "loss/crossentropy": 2.481651544570923, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.2308521568775177, "step": 6830 }, { "epoch": 0.10200165747093826, "grad_norm": 0.37109375, "grad_norm_var": 0.0014765262603759766, "learning_rate": 0.0001, "loss": 1.5212, "loss/crossentropy": 2.66173779964447, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20087897032499313, "step": 6831 }, { "epoch": 0.10201658964155325, "grad_norm": 0.337890625, "grad_norm_var": 0.0015017032623291016, "learning_rate": 0.0001, "loss": 1.4617, "loss/crossentropy": 2.5104986429214478, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.19995791465044022, "step": 6832 }, { "epoch": 0.10203152181216822, "grad_norm": 0.333984375, "grad_norm_var": 0.0009780724843343098, "learning_rate": 0.0001, "loss": 1.5056, "loss/crossentropy": 2.472571611404419, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.1969640627503395, "step": 6833 }, { "epoch": 0.10204645398278321, "grad_norm": 0.333984375, "grad_norm_var": 0.0009971459706624348, "learning_rate": 0.0001, "loss": 1.389, "loss/crossentropy": 2.636669874191284, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17810769379138947, "step": 6834 }, { "epoch": 0.10206138615339819, "grad_norm": 0.400390625, "grad_norm_var": 0.0008422215779622396, "learning_rate": 0.0001, "loss": 1.499, "loss/crossentropy": 2.6832125186920166, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.20604941248893738, "step": 6835 }, { "epoch": 0.10207631832401318, "grad_norm": 0.322265625, "grad_norm_var": 0.0008893330891927083, "learning_rate": 0.0001, "loss": 1.4519, "loss/crossentropy": 2.4758658409118652, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.19013907760381699, "step": 6836 }, { "epoch": 0.10209125049462815, "grad_norm": 0.33984375, "grad_norm_var": 0.0007755120595296223, "learning_rate": 0.0001, "loss": 1.592, "loss/crossentropy": 2.4692063331604004, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.2365719974040985, "step": 6837 }, { "epoch": 0.10210618266524313, "grad_norm": 0.318359375, "grad_norm_var": 0.0008553663889567057, "learning_rate": 0.0001, "loss": 1.4934, "loss/crossentropy": 2.4637575149536133, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.19258029013872147, "step": 6838 }, { "epoch": 0.10212111483585812, "grad_norm": 0.392578125, "grad_norm_var": 0.0009083429972330729, "learning_rate": 0.0001, "loss": 1.6883, "loss/crossentropy": 2.6018500328063965, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.26250898838043213, "step": 6839 }, { "epoch": 0.1021360470064731, "grad_norm": 0.328125, "grad_norm_var": 0.0009332656860351563, "learning_rate": 0.0001, "loss": 1.306, "loss/crossentropy": 2.6032639741897583, "loss/fcd": 1.1484375, "loss/idx": 10.0, "loss/logits": 0.15754090249538422, "step": 6840 }, { "epoch": 0.10215097917708808, "grad_norm": 0.30859375, "grad_norm_var": 0.0008996963500976562, "learning_rate": 0.0001, "loss": 1.3704, "loss/crossentropy": 2.4383981227874756, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.16333775967359543, "step": 6841 }, { "epoch": 0.10216591134770306, "grad_norm": 0.373046875, "grad_norm_var": 0.0009312947591145833, "learning_rate": 0.0001, "loss": 1.4262, "loss/crossentropy": 2.350147008895874, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.17227932065725327, "step": 6842 }, { "epoch": 0.10218084351831803, "grad_norm": 0.32421875, "grad_norm_var": 0.0007214864095052083, "learning_rate": 0.0001, "loss": 1.4508, "loss/crossentropy": 2.4277522563934326, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.20076629519462585, "step": 6843 }, { "epoch": 0.10219577568893302, "grad_norm": 0.408203125, "grad_norm_var": 0.0009459813435872395, "learning_rate": 0.0001, "loss": 1.6929, "loss/crossentropy": 2.5638216733932495, "loss/fcd": 1.4609375, "loss/idx": 10.0, "loss/logits": 0.23192373663187027, "step": 6844 }, { "epoch": 0.102210707859548, "grad_norm": 0.353515625, "grad_norm_var": 0.0009411970774332682, "learning_rate": 0.0001, "loss": 1.4502, "loss/crossentropy": 2.620309591293335, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.1884540095925331, "step": 6845 }, { "epoch": 0.10222564003016299, "grad_norm": 0.3515625, "grad_norm_var": 0.0009414037068684896, "learning_rate": 0.0001, "loss": 1.379, "loss/crossentropy": 2.8115698099136353, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.17591112852096558, "step": 6846 }, { "epoch": 0.10224057220077797, "grad_norm": 0.3515625, "grad_norm_var": 0.0009099324544270833, "learning_rate": 0.0001, "loss": 1.4763, "loss/crossentropy": 2.583843231201172, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.1989489495754242, "step": 6847 }, { "epoch": 0.10225550437139294, "grad_norm": 0.361328125, "grad_norm_var": 0.0009106953938802083, "learning_rate": 0.0001, "loss": 1.4216, "loss/crossentropy": 2.6386808156967163, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18332670629024506, "step": 6848 }, { "epoch": 0.10227043654200793, "grad_norm": 0.328125, "grad_norm_var": 0.0009254296620686849, "learning_rate": 0.0001, "loss": 1.4803, "loss/crossentropy": 2.63324511051178, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.202975794672966, "step": 6849 }, { "epoch": 0.1022853687126229, "grad_norm": 0.306640625, "grad_norm_var": 0.0010295708974202473, "learning_rate": 0.0001, "loss": 1.3916, "loss/crossentropy": 2.2607028484344482, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.16118034720420837, "step": 6850 }, { "epoch": 0.1023003008832379, "grad_norm": 0.3125, "grad_norm_var": 0.0008986790974934896, "learning_rate": 0.0001, "loss": 1.5047, "loss/crossentropy": 2.4474233388900757, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.20396727323532104, "step": 6851 }, { "epoch": 0.10231523305385287, "grad_norm": 0.357421875, "grad_norm_var": 0.0008809407552083333, "learning_rate": 0.0001, "loss": 1.4964, "loss/crossentropy": 2.717299222946167, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.21905633807182312, "step": 6852 }, { "epoch": 0.10233016522446786, "grad_norm": 0.326171875, "grad_norm_var": 0.0009015242258707683, "learning_rate": 0.0001, "loss": 1.3976, "loss/crossentropy": 2.61505126953125, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.18275239318609238, "step": 6853 }, { "epoch": 0.10234509739508284, "grad_norm": 0.34375, "grad_norm_var": 0.0008554458618164062, "learning_rate": 0.0001, "loss": 1.6002, "loss/crossentropy": 2.5635184049606323, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.22909186035394669, "step": 6854 }, { "epoch": 0.10236002956569781, "grad_norm": 0.359375, "grad_norm_var": 0.0007157484690348307, "learning_rate": 0.0001, "loss": 1.5546, "loss/crossentropy": 2.561513304710388, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.22253337502479553, "step": 6855 }, { "epoch": 0.1023749617363128, "grad_norm": 0.337890625, "grad_norm_var": 0.0007018407185872395, "learning_rate": 0.0001, "loss": 1.521, "loss/crossentropy": 2.3766911029815674, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.21242332458496094, "step": 6856 }, { "epoch": 0.10238989390692778, "grad_norm": 0.36328125, "grad_norm_var": 0.0006306330362955729, "learning_rate": 0.0001, "loss": 1.505, "loss/crossentropy": 2.6678727865219116, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.1924753561615944, "step": 6857 }, { "epoch": 0.10240482607754277, "grad_norm": 0.337890625, "grad_norm_var": 0.0005877176920572917, "learning_rate": 0.0001, "loss": 1.4395, "loss/crossentropy": 2.4846253395080566, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18556836992502213, "step": 6858 }, { "epoch": 0.10241975824815774, "grad_norm": 0.3046875, "grad_norm_var": 0.0006662368774414063, "learning_rate": 0.0001, "loss": 1.4097, "loss/crossentropy": 2.612424373626709, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.19091110676527023, "step": 6859 }, { "epoch": 0.10243469041877272, "grad_norm": 0.384765625, "grad_norm_var": 0.0004999160766601563, "learning_rate": 0.0001, "loss": 1.5038, "loss/crossentropy": 2.7093827724456787, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19909146428108215, "step": 6860 }, { "epoch": 0.10244962258938771, "grad_norm": 0.38671875, "grad_norm_var": 0.0006174564361572266, "learning_rate": 0.0001, "loss": 1.4514, "loss/crossentropy": 2.641335964202881, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.20529381185770035, "step": 6861 }, { "epoch": 0.10246455476000269, "grad_norm": 0.322265625, "grad_norm_var": 0.0006439208984375, "learning_rate": 0.0001, "loss": 1.5465, "loss/crossentropy": 2.5640798807144165, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.2105601280927658, "step": 6862 }, { "epoch": 0.10247948693061767, "grad_norm": 0.341796875, "grad_norm_var": 0.0006384372711181641, "learning_rate": 0.0001, "loss": 1.5583, "loss/crossentropy": 2.603134512901306, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.2106238678097725, "step": 6863 }, { "epoch": 0.10249441910123265, "grad_norm": 0.357421875, "grad_norm_var": 0.0006294091542561849, "learning_rate": 0.0001, "loss": 1.5533, "loss/crossentropy": 2.5871552228927612, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.20560412108898163, "step": 6864 }, { "epoch": 0.10250935127184763, "grad_norm": 0.427734375, "grad_norm_var": 0.0010663350423177084, "learning_rate": 0.0001, "loss": 1.7153, "loss/crossentropy": 2.4533538818359375, "loss/fcd": 1.48046875, "loss/idx": 10.0, "loss/logits": 0.23487596213817596, "step": 6865 }, { "epoch": 0.10252428344246262, "grad_norm": 0.341796875, "grad_norm_var": 0.0009490331013997396, "learning_rate": 0.0001, "loss": 1.5272, "loss/crossentropy": 2.522216558456421, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.2068551853299141, "step": 6866 }, { "epoch": 0.10253921561307759, "grad_norm": 0.3359375, "grad_norm_var": 0.0008651097615559896, "learning_rate": 0.0001, "loss": 1.3622, "loss/crossentropy": 2.474809169769287, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.15903636813163757, "step": 6867 }, { "epoch": 0.10255414778369258, "grad_norm": 0.388671875, "grad_norm_var": 0.0009495417277018229, "learning_rate": 0.0001, "loss": 1.388, "loss/crossentropy": 2.817808985710144, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.16926880925893784, "step": 6868 }, { "epoch": 0.10256907995430756, "grad_norm": 0.31640625, "grad_norm_var": 0.0009914239247639973, "learning_rate": 0.0001, "loss": 1.3821, "loss/crossentropy": 2.830514073371887, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.1711719110608101, "step": 6869 }, { "epoch": 0.10258401212492253, "grad_norm": 0.3125, "grad_norm_var": 0.0010916233062744141, "learning_rate": 0.0001, "loss": 1.3842, "loss/crossentropy": 2.4527628421783447, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.17719318717718124, "step": 6870 }, { "epoch": 0.10259894429553752, "grad_norm": 0.361328125, "grad_norm_var": 0.0010939915974934897, "learning_rate": 0.0001, "loss": 1.5902, "loss/crossentropy": 2.2397927045822144, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.21133900433778763, "step": 6871 }, { "epoch": 0.1026138764661525, "grad_norm": 0.359375, "grad_norm_var": 0.0010843753814697265, "learning_rate": 0.0001, "loss": 1.482, "loss/crossentropy": 2.801877737045288, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19292089343070984, "step": 6872 }, { "epoch": 0.10262880863676749, "grad_norm": 0.31640625, "grad_norm_var": 0.0011553287506103516, "learning_rate": 0.0001, "loss": 1.3887, "loss/crossentropy": 2.6978613138198853, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.1777985841035843, "step": 6873 }, { "epoch": 0.10264374080738246, "grad_norm": 0.34765625, "grad_norm_var": 0.0011458714803059896, "learning_rate": 0.0001, "loss": 1.4705, "loss/crossentropy": 2.4978095293045044, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.18921610713005066, "step": 6874 }, { "epoch": 0.10265867297799745, "grad_norm": 0.328125, "grad_norm_var": 0.0010375340779622396, "learning_rate": 0.0001, "loss": 1.5595, "loss/crossentropy": 2.896276831626892, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.20407478511333466, "step": 6875 }, { "epoch": 0.10267360514861243, "grad_norm": 0.3984375, "grad_norm_var": 0.00110929807027181, "learning_rate": 0.0001, "loss": 1.508, "loss/crossentropy": 2.669482707977295, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.19158221036195755, "step": 6876 }, { "epoch": 0.1026885373192274, "grad_norm": 0.32421875, "grad_norm_var": 0.0010696252187093098, "learning_rate": 0.0001, "loss": 1.3078, "loss/crossentropy": 2.569803476333618, "loss/fcd": 1.15234375, "loss/idx": 10.0, "loss/logits": 0.15548871457576752, "step": 6877 }, { "epoch": 0.1027034694898424, "grad_norm": 0.369140625, "grad_norm_var": 0.001041396458943685, "learning_rate": 0.0001, "loss": 1.5359, "loss/crossentropy": 2.696001648902893, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.20390993356704712, "step": 6878 }, { "epoch": 0.10271840166045737, "grad_norm": 0.41015625, "grad_norm_var": 0.0012433369954427083, "learning_rate": 0.0001, "loss": 1.5434, "loss/crossentropy": 2.5722599029541016, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.19578617811203003, "step": 6879 }, { "epoch": 0.10273333383107236, "grad_norm": 0.376953125, "grad_norm_var": 0.0012709935506184896, "learning_rate": 0.0001, "loss": 1.5865, "loss/crossentropy": 2.459999203681946, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.21537824720144272, "step": 6880 }, { "epoch": 0.10274826600168734, "grad_norm": 0.34375, "grad_norm_var": 0.0009217421213785807, "learning_rate": 0.0001, "loss": 1.592, "loss/crossentropy": 2.523553490638733, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.22087278962135315, "step": 6881 }, { "epoch": 0.10276319817230231, "grad_norm": 0.384765625, "grad_norm_var": 0.0009790897369384766, "learning_rate": 0.0001, "loss": 1.6318, "loss/crossentropy": 2.4760076999664307, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.22947100549936295, "step": 6882 }, { "epoch": 0.1027781303429173, "grad_norm": 0.359375, "grad_norm_var": 0.0009550571441650391, "learning_rate": 0.0001, "loss": 1.5961, "loss/crossentropy": 2.4125096797943115, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.22892752289772034, "step": 6883 }, { "epoch": 0.10279306251353228, "grad_norm": 0.38671875, "grad_norm_var": 0.000946807861328125, "learning_rate": 0.0001, "loss": 1.8673, "loss/crossentropy": 2.3571925163269043, "loss/fcd": 1.578125, "loss/idx": 10.0, "loss/logits": 0.2891382873058319, "step": 6884 }, { "epoch": 0.10280799468414727, "grad_norm": 0.33984375, "grad_norm_var": 0.0008575439453125, "learning_rate": 0.0001, "loss": 1.4995, "loss/crossentropy": 2.2248438000679016, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.21431727707386017, "step": 6885 }, { "epoch": 0.10282292685476224, "grad_norm": 0.396484375, "grad_norm_var": 0.0007953484853108724, "learning_rate": 0.0001, "loss": 1.5535, "loss/crossentropy": 2.512876033782959, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.2136368304491043, "step": 6886 }, { "epoch": 0.10283785902537722, "grad_norm": 0.373046875, "grad_norm_var": 0.0008018334706624349, "learning_rate": 0.0001, "loss": 1.4854, "loss/crossentropy": 2.499642491340637, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19632337242364883, "step": 6887 }, { "epoch": 0.10285279119599221, "grad_norm": 0.341796875, "grad_norm_var": 0.0008305867513020833, "learning_rate": 0.0001, "loss": 1.3377, "loss/crossentropy": 2.4712408781051636, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.1619018167257309, "step": 6888 }, { "epoch": 0.10286772336660718, "grad_norm": 0.375, "grad_norm_var": 0.0006865819295247395, "learning_rate": 0.0001, "loss": 1.5182, "loss/crossentropy": 2.489895224571228, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.2056661769747734, "step": 6889 }, { "epoch": 0.10288265553722217, "grad_norm": 0.365234375, "grad_norm_var": 0.0006629784901936848, "learning_rate": 0.0001, "loss": 1.4256, "loss/crossentropy": 2.8143445253372192, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.17947788536548615, "step": 6890 }, { "epoch": 0.10289758770783715, "grad_norm": 0.60546875, "grad_norm_var": 0.004030466079711914, "learning_rate": 0.0001, "loss": 1.6941, "loss/crossentropy": 2.604351758956909, "loss/fcd": 1.47265625, "loss/idx": 10.0, "loss/logits": 0.22143805027008057, "step": 6891 }, { "epoch": 0.10291251987845212, "grad_norm": 0.376953125, "grad_norm_var": 0.004019101460774739, "learning_rate": 0.0001, "loss": 1.4024, "loss/crossentropy": 2.767201066017151, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.18361325562000275, "step": 6892 }, { "epoch": 0.10292745204906711, "grad_norm": 0.359375, "grad_norm_var": 0.0038205464680989582, "learning_rate": 0.0001, "loss": 1.5999, "loss/crossentropy": 2.5083101987838745, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.22882810980081558, "step": 6893 }, { "epoch": 0.10294238421968209, "grad_norm": 0.53515625, "grad_norm_var": 0.00518644650777181, "learning_rate": 0.0001, "loss": 1.5709, "loss/crossentropy": 2.516119599342346, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.2037503868341446, "step": 6894 }, { "epoch": 0.10295731639029708, "grad_norm": 0.39453125, "grad_norm_var": 0.005171442031860351, "learning_rate": 0.0001, "loss": 1.5352, "loss/crossentropy": 2.611461043357849, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.2188068851828575, "step": 6895 }, { "epoch": 0.10297224856091206, "grad_norm": 0.69921875, "grad_norm_var": 0.010901832580566406, "learning_rate": 0.0001, "loss": 1.5975, "loss/crossentropy": 2.748636841773987, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.21073131263256073, "step": 6896 }, { "epoch": 0.10298718073152704, "grad_norm": 0.37109375, "grad_norm_var": 0.010689544677734374, "learning_rate": 0.0001, "loss": 1.5028, "loss/crossentropy": 2.6137431859970093, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.19416970014572144, "step": 6897 }, { "epoch": 0.10300211290214202, "grad_norm": 0.43359375, "grad_norm_var": 0.010631926854451497, "learning_rate": 0.0001, "loss": 1.4932, "loss/crossentropy": 2.5438637733459473, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.20018815249204636, "step": 6898 }, { "epoch": 0.103017045072757, "grad_norm": 0.419921875, "grad_norm_var": 0.010375213623046876, "learning_rate": 0.0001, "loss": 1.5821, "loss/crossentropy": 2.4041961431503296, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.20705751329660416, "step": 6899 }, { "epoch": 0.10303197724337199, "grad_norm": 0.44921875, "grad_norm_var": 0.010314178466796876, "learning_rate": 0.0001, "loss": 1.4782, "loss/crossentropy": 2.5052554607391357, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.20082378387451172, "step": 6900 }, { "epoch": 0.10304690941398696, "grad_norm": 0.369140625, "grad_norm_var": 0.01002640724182129, "learning_rate": 0.0001, "loss": 1.4261, "loss/crossentropy": 2.6924551725387573, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.187808059155941, "step": 6901 }, { "epoch": 0.10306184158460195, "grad_norm": 0.384765625, "grad_norm_var": 0.01008591651916504, "learning_rate": 0.0001, "loss": 1.563, "loss/crossentropy": 2.850257396697998, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.21924711018800735, "step": 6902 }, { "epoch": 0.10307677375521693, "grad_norm": 0.44921875, "grad_norm_var": 0.009886932373046876, "learning_rate": 0.0001, "loss": 1.8409, "loss/crossentropy": 2.5734649896621704, "loss/fcd": 1.53515625, "loss/idx": 10.0, "loss/logits": 0.3057497590780258, "step": 6903 }, { "epoch": 0.1030917059258319, "grad_norm": 0.384765625, "grad_norm_var": 0.009479204813639322, "learning_rate": 0.0001, "loss": 1.3707, "loss/crossentropy": 2.455726385116577, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.16759269684553146, "step": 6904 }, { "epoch": 0.10310663809644689, "grad_norm": 0.33984375, "grad_norm_var": 0.009841410319010417, "learning_rate": 0.0001, "loss": 1.5281, "loss/crossentropy": 2.5784465074539185, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.21559759974479675, "step": 6905 }, { "epoch": 0.10312157026706187, "grad_norm": 0.380859375, "grad_norm_var": 0.009714253743489583, "learning_rate": 0.0001, "loss": 1.5671, "loss/crossentropy": 2.910441517829895, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.23896904289722443, "step": 6906 }, { "epoch": 0.10313650243767686, "grad_norm": 0.365234375, "grad_norm_var": 0.007847197850545247, "learning_rate": 0.0001, "loss": 1.5045, "loss/crossentropy": 2.559905767440796, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19985750317573547, "step": 6907 }, { "epoch": 0.10315143460829183, "grad_norm": 0.34765625, "grad_norm_var": 0.00806725819905599, "learning_rate": 0.0001, "loss": 1.6192, "loss/crossentropy": 2.5605177879333496, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.23635021597146988, "step": 6908 }, { "epoch": 0.10316636677890681, "grad_norm": 0.314453125, "grad_norm_var": 0.008542871475219727, "learning_rate": 0.0001, "loss": 1.3444, "loss/crossentropy": 2.5387388467788696, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.16082942485809326, "step": 6909 }, { "epoch": 0.1031812989495218, "grad_norm": 0.349609375, "grad_norm_var": 0.007719930013020833, "learning_rate": 0.0001, "loss": 1.3728, "loss/crossentropy": 2.715502619743347, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.1579660251736641, "step": 6910 }, { "epoch": 0.10319623112013677, "grad_norm": 0.318359375, "grad_norm_var": 0.008171828587849934, "learning_rate": 0.0001, "loss": 1.4863, "loss/crossentropy": 2.548450469970703, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19724326580762863, "step": 6911 }, { "epoch": 0.10321116329075176, "grad_norm": 0.32421875, "grad_norm_var": 0.0019279321034749348, "learning_rate": 0.0001, "loss": 1.4256, "loss/crossentropy": 2.6212018728256226, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1833975464105606, "step": 6912 }, { "epoch": 0.10322609546136674, "grad_norm": 0.3125, "grad_norm_var": 0.0021739800771077475, "learning_rate": 0.0001, "loss": 1.4644, "loss/crossentropy": 2.4436510801315308, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.20270511507987976, "step": 6913 }, { "epoch": 0.10324102763198173, "grad_norm": 0.3671875, "grad_norm_var": 0.0018994490305582683, "learning_rate": 0.0001, "loss": 1.505, "loss/crossentropy": 2.731534957885742, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.200324185192585, "step": 6914 }, { "epoch": 0.1032559598025967, "grad_norm": 0.404296875, "grad_norm_var": 0.0018050988515218099, "learning_rate": 0.0001, "loss": 1.6544, "loss/crossentropy": 2.5021069049835205, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.22082732617855072, "step": 6915 }, { "epoch": 0.10327089197321168, "grad_norm": 0.341796875, "grad_norm_var": 0.0013391494750976563, "learning_rate": 0.0001, "loss": 1.3531, "loss/crossentropy": 2.63551664352417, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.16954811662435532, "step": 6916 }, { "epoch": 0.10328582414382667, "grad_norm": 0.40625, "grad_norm_var": 0.001472330093383789, "learning_rate": 0.0001, "loss": 1.6696, "loss/crossentropy": 2.887903094291687, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.24379990994930267, "step": 6917 }, { "epoch": 0.10330075631444165, "grad_norm": 0.703125, "grad_norm_var": 0.008775838216145833, "learning_rate": 0.0001, "loss": 1.6243, "loss/crossentropy": 2.6843336820602417, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.2531866282224655, "step": 6918 }, { "epoch": 0.10331568848505664, "grad_norm": 0.408203125, "grad_norm_var": 0.008512481053670248, "learning_rate": 0.0001, "loss": 1.547, "loss/crossentropy": 2.5983386039733887, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.22664763778448105, "step": 6919 }, { "epoch": 0.10333062065567161, "grad_norm": 0.328125, "grad_norm_var": 0.00867150624593099, "learning_rate": 0.0001, "loss": 1.3789, "loss/crossentropy": 2.4697033166885376, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.17971903085708618, "step": 6920 }, { "epoch": 0.10334555282628659, "grad_norm": 0.359375, "grad_norm_var": 0.008601888020833334, "learning_rate": 0.0001, "loss": 1.4764, "loss/crossentropy": 2.5731446743011475, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19902095943689346, "step": 6921 }, { "epoch": 0.10336048499690158, "grad_norm": 0.39453125, "grad_norm_var": 0.008620691299438477, "learning_rate": 0.0001, "loss": 1.5268, "loss/crossentropy": 2.745972990989685, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.19869670271873474, "step": 6922 }, { "epoch": 0.10337541716751655, "grad_norm": 0.330078125, "grad_norm_var": 0.008756875991821289, "learning_rate": 0.0001, "loss": 1.3826, "loss/crossentropy": 2.6813660860061646, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.183369942009449, "step": 6923 }, { "epoch": 0.10339034933813154, "grad_norm": 0.337890625, "grad_norm_var": 0.008799235026041666, "learning_rate": 0.0001, "loss": 1.6819, "loss/crossentropy": 2.666988492012024, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.26785808801651, "step": 6924 }, { "epoch": 0.10340528150874652, "grad_norm": 0.33203125, "grad_norm_var": 0.008676640192667643, "learning_rate": 0.0001, "loss": 1.4256, "loss/crossentropy": 2.5558847188949585, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.1873033046722412, "step": 6925 }, { "epoch": 0.1034202136793615, "grad_norm": 0.36328125, "grad_norm_var": 0.008640034993489584, "learning_rate": 0.0001, "loss": 1.4578, "loss/crossentropy": 2.7615586519241333, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19213703274726868, "step": 6926 }, { "epoch": 0.10343514584997648, "grad_norm": 0.38671875, "grad_norm_var": 0.008398040135701498, "learning_rate": 0.0001, "loss": 1.5463, "loss/crossentropy": 2.6475884914398193, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.19868525117635727, "step": 6927 }, { "epoch": 0.10345007802059146, "grad_norm": 0.7890625, "grad_norm_var": 0.018369785944620767, "learning_rate": 0.0001, "loss": 1.6323, "loss/crossentropy": 2.6987072229385376, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.21045778691768646, "step": 6928 }, { "epoch": 0.10346501019120645, "grad_norm": 0.392578125, "grad_norm_var": 0.017726580301920574, "learning_rate": 0.0001, "loss": 1.5809, "loss/crossentropy": 2.73064386844635, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.2215459644794464, "step": 6929 }, { "epoch": 0.10347994236182143, "grad_norm": 0.326171875, "grad_norm_var": 0.018094746271769206, "learning_rate": 0.0001, "loss": 1.4678, "loss/crossentropy": 2.655944585800171, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.202172189950943, "step": 6930 }, { "epoch": 0.1034948745324364, "grad_norm": 0.396484375, "grad_norm_var": 0.01810733477274577, "learning_rate": 0.0001, "loss": 1.385, "loss/crossentropy": 2.5608131885528564, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17408516258001328, "step": 6931 }, { "epoch": 0.10350980670305139, "grad_norm": 0.375, "grad_norm_var": 0.017864418029785157, "learning_rate": 0.0001, "loss": 1.4409, "loss/crossentropy": 2.724845290184021, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.17531532049179077, "step": 6932 }, { "epoch": 0.10352473887366637, "grad_norm": 0.3828125, "grad_norm_var": 0.017923927307128905, "learning_rate": 0.0001, "loss": 1.6204, "loss/crossentropy": 2.510064482688904, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.23363880068063736, "step": 6933 }, { "epoch": 0.10353967104428136, "grad_norm": 0.31640625, "grad_norm_var": 0.01230316162109375, "learning_rate": 0.0001, "loss": 1.4065, "loss/crossentropy": 2.5207080841064453, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.1721126213669777, "step": 6934 }, { "epoch": 0.10355460321489633, "grad_norm": 0.34765625, "grad_norm_var": 0.012374607721964519, "learning_rate": 0.0001, "loss": 1.4717, "loss/crossentropy": 2.48144793510437, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.18258824944496155, "step": 6935 }, { "epoch": 0.10356953538551132, "grad_norm": 0.435546875, "grad_norm_var": 0.012282816569010417, "learning_rate": 0.0001, "loss": 1.572, "loss/crossentropy": 2.495478868484497, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.23211613297462463, "step": 6936 }, { "epoch": 0.1035844675561263, "grad_norm": 0.341796875, "grad_norm_var": 0.012377659479777018, "learning_rate": 0.0001, "loss": 1.4932, "loss/crossentropy": 2.42964506149292, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.2002619281411171, "step": 6937 }, { "epoch": 0.10359939972674127, "grad_norm": 0.333984375, "grad_norm_var": 0.012574259440104167, "learning_rate": 0.0001, "loss": 1.467, "loss/crossentropy": 2.7024930715560913, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.20920677483081818, "step": 6938 }, { "epoch": 0.10361433189735626, "grad_norm": 0.353515625, "grad_norm_var": 0.012431589762369792, "learning_rate": 0.0001, "loss": 1.4392, "loss/crossentropy": 2.660399317741394, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18917962908744812, "step": 6939 }, { "epoch": 0.10362926406797124, "grad_norm": 0.3203125, "grad_norm_var": 0.012568775812784832, "learning_rate": 0.0001, "loss": 1.4219, "loss/crossentropy": 2.656475782394409, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18752521276474, "step": 6940 }, { "epoch": 0.10364419623858623, "grad_norm": 0.353515625, "grad_norm_var": 0.012439918518066407, "learning_rate": 0.0001, "loss": 1.5281, "loss/crossentropy": 2.576154947280884, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.207807719707489, "step": 6941 }, { "epoch": 0.1036591284092012, "grad_norm": 0.330078125, "grad_norm_var": 0.012620147069295247, "learning_rate": 0.0001, "loss": 1.4992, "loss/crossentropy": 2.5537034273147583, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.21401605010032654, "step": 6942 }, { "epoch": 0.10367406057981618, "grad_norm": 0.36328125, "grad_norm_var": 0.012653334935506185, "learning_rate": 0.0001, "loss": 1.4963, "loss/crossentropy": 2.280517816543579, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.18771256506443024, "step": 6943 }, { "epoch": 0.10368899275043117, "grad_norm": 0.306640625, "grad_norm_var": 0.001201311747233073, "learning_rate": 0.0001, "loss": 1.4178, "loss/crossentropy": 2.6229212284088135, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.19128210097551346, "step": 6944 }, { "epoch": 0.10370392492104615, "grad_norm": 0.34375, "grad_norm_var": 0.0011039574940999348, "learning_rate": 0.0001, "loss": 1.4799, "loss/crossentropy": 2.504857897758484, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.19861829280853271, "step": 6945 }, { "epoch": 0.10371885709166113, "grad_norm": 0.373046875, "grad_norm_var": 0.00108183224995931, "learning_rate": 0.0001, "loss": 1.6441, "loss/crossentropy": 2.481290817260742, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.2222488597035408, "step": 6946 }, { "epoch": 0.10373378926227611, "grad_norm": 0.349609375, "grad_norm_var": 0.0009574731190999349, "learning_rate": 0.0001, "loss": 1.3826, "loss/crossentropy": 2.53558886051178, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.1716964840888977, "step": 6947 }, { "epoch": 0.10374872143289109, "grad_norm": 0.36328125, "grad_norm_var": 0.0009296258290608724, "learning_rate": 0.0001, "loss": 1.4293, "loss/crossentropy": 2.410451889038086, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.19494039565324783, "step": 6948 }, { "epoch": 0.10376365360350608, "grad_norm": 0.396484375, "grad_norm_var": 0.0009993871053059896, "learning_rate": 0.0001, "loss": 1.5819, "loss/crossentropy": 2.690467357635498, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.21862547099590302, "step": 6949 }, { "epoch": 0.10377858577412105, "grad_norm": 0.34375, "grad_norm_var": 0.00091705322265625, "learning_rate": 0.0001, "loss": 1.475, "loss/crossentropy": 2.558215022087097, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.1898881196975708, "step": 6950 }, { "epoch": 0.10379351794473604, "grad_norm": 0.40234375, "grad_norm_var": 0.001061248779296875, "learning_rate": 0.0001, "loss": 1.4514, "loss/crossentropy": 2.740514874458313, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.19753386080265045, "step": 6951 }, { "epoch": 0.10380845011535102, "grad_norm": 0.4140625, "grad_norm_var": 0.0008649031321207682, "learning_rate": 0.0001, "loss": 1.6406, "loss/crossentropy": 2.55289363861084, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.2382589429616928, "step": 6952 }, { "epoch": 0.10382338228596599, "grad_norm": 0.34375, "grad_norm_var": 0.0008615493774414062, "learning_rate": 0.0001, "loss": 1.4172, "loss/crossentropy": 2.721089243888855, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.1906386837363243, "step": 6953 }, { "epoch": 0.10383831445658098, "grad_norm": 0.40234375, "grad_norm_var": 0.0009555657704671224, "learning_rate": 0.0001, "loss": 1.45, "loss/crossentropy": 2.524116277694702, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.17264283448457718, "step": 6954 }, { "epoch": 0.10385324662719596, "grad_norm": 0.33203125, "grad_norm_var": 0.001002947489420573, "learning_rate": 0.0001, "loss": 1.2748, "loss/crossentropy": 2.428911328315735, "loss/fcd": 1.12890625, "loss/idx": 10.0, "loss/logits": 0.14585894346237183, "step": 6955 }, { "epoch": 0.10386817879781095, "grad_norm": 0.3515625, "grad_norm_var": 0.0009042739868164063, "learning_rate": 0.0001, "loss": 1.5037, "loss/crossentropy": 2.5046918392181396, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.1873137429356575, "step": 6956 }, { "epoch": 0.10388311096842592, "grad_norm": 0.35546875, "grad_norm_var": 0.0009026686350504557, "learning_rate": 0.0001, "loss": 1.5611, "loss/crossentropy": 2.5079081058502197, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.2094946801662445, "step": 6957 }, { "epoch": 0.10389804313904091, "grad_norm": 0.453125, "grad_norm_var": 0.001346270243326823, "learning_rate": 0.0001, "loss": 1.6073, "loss/crossentropy": 2.868705153465271, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.23618003726005554, "step": 6958 }, { "epoch": 0.10391297530965589, "grad_norm": 0.396484375, "grad_norm_var": 0.0013924757639567056, "learning_rate": 0.0001, "loss": 1.3573, "loss/crossentropy": 2.674582839012146, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.1698054075241089, "step": 6959 }, { "epoch": 0.10392790748027086, "grad_norm": 0.333984375, "grad_norm_var": 0.0012064456939697265, "learning_rate": 0.0001, "loss": 1.5486, "loss/crossentropy": 2.5436220169067383, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.2126321792602539, "step": 6960 }, { "epoch": 0.10394283965088585, "grad_norm": 0.36328125, "grad_norm_var": 0.0011562188466389975, "learning_rate": 0.0001, "loss": 1.5373, "loss/crossentropy": 2.388398289680481, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.2169686108827591, "step": 6961 }, { "epoch": 0.10395777182150083, "grad_norm": 0.357421875, "grad_norm_var": 0.0011722405751546225, "learning_rate": 0.0001, "loss": 1.5213, "loss/crossentropy": 2.6195249557495117, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.1931571587920189, "step": 6962 }, { "epoch": 0.10397270399211582, "grad_norm": 0.390625, "grad_norm_var": 0.0011525472005208333, "learning_rate": 0.0001, "loss": 1.5091, "loss/crossentropy": 2.435229778289795, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.18876274675130844, "step": 6963 }, { "epoch": 0.1039876361627308, "grad_norm": 0.333984375, "grad_norm_var": 0.0012519677480061849, "learning_rate": 0.0001, "loss": 1.4226, "loss/crossentropy": 2.562928318977356, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18044014275074005, "step": 6964 }, { "epoch": 0.10400256833334577, "grad_norm": 0.34375, "grad_norm_var": 0.0012618382771809897, "learning_rate": 0.0001, "loss": 1.6076, "loss/crossentropy": 2.672658920288086, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.23261812329292297, "step": 6965 }, { "epoch": 0.10401750050396076, "grad_norm": 0.376953125, "grad_norm_var": 0.0012150923411051431, "learning_rate": 0.0001, "loss": 1.402, "loss/crossentropy": 2.602534294128418, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.18324093520641327, "step": 6966 }, { "epoch": 0.10403243267457574, "grad_norm": 0.3671875, "grad_norm_var": 0.0011498610178629558, "learning_rate": 0.0001, "loss": 1.4798, "loss/crossentropy": 2.4398903846740723, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19078634679317474, "step": 6967 }, { "epoch": 0.10404736484519073, "grad_norm": 0.302734375, "grad_norm_var": 0.0012667338053385417, "learning_rate": 0.0001, "loss": 1.4097, "loss/crossentropy": 2.4767669439315796, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.17535457760095596, "step": 6968 }, { "epoch": 0.1040622970158057, "grad_norm": 0.419921875, "grad_norm_var": 0.0014359633127848308, "learning_rate": 0.0001, "loss": 1.7488, "loss/crossentropy": 2.7998872995376587, "loss/fcd": 1.4921875, "loss/idx": 10.0, "loss/logits": 0.2566307708621025, "step": 6969 }, { "epoch": 0.10407722918642068, "grad_norm": 0.353515625, "grad_norm_var": 0.001358477274576823, "learning_rate": 0.0001, "loss": 1.4495, "loss/crossentropy": 2.453445792198181, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.17601607739925385, "step": 6970 }, { "epoch": 0.10409216135703567, "grad_norm": 0.3671875, "grad_norm_var": 0.0012835184733072916, "learning_rate": 0.0001, "loss": 1.6185, "loss/crossentropy": 2.7170209884643555, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.22392212599515915, "step": 6971 }, { "epoch": 0.10410709352765064, "grad_norm": 0.3359375, "grad_norm_var": 0.0013303120930989584, "learning_rate": 0.0001, "loss": 1.3368, "loss/crossentropy": 2.53289794921875, "loss/fcd": 1.171875, "loss/idx": 10.0, "loss/logits": 0.16495653986930847, "step": 6972 }, { "epoch": 0.10412202569826563, "grad_norm": 0.369140625, "grad_norm_var": 0.0013233025868733723, "learning_rate": 0.0001, "loss": 1.5614, "loss/crossentropy": 2.430857300758362, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.21765117347240448, "step": 6973 }, { "epoch": 0.10413695786888061, "grad_norm": 0.322265625, "grad_norm_var": 0.00088348388671875, "learning_rate": 0.0001, "loss": 1.4055, "loss/crossentropy": 2.684594988822937, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.1789247691631317, "step": 6974 }, { "epoch": 0.1041518900394956, "grad_norm": 0.359375, "grad_norm_var": 0.000781106948852539, "learning_rate": 0.0001, "loss": 1.3947, "loss/crossentropy": 2.6660447120666504, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.1720680445432663, "step": 6975 }, { "epoch": 0.10416682221011057, "grad_norm": 0.33984375, "grad_norm_var": 0.0007659912109375, "learning_rate": 0.0001, "loss": 1.5445, "loss/crossentropy": 2.339569091796875, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20855986326932907, "step": 6976 }, { "epoch": 0.10418175438072555, "grad_norm": 0.330078125, "grad_norm_var": 0.0008046309153238933, "learning_rate": 0.0001, "loss": 1.4535, "loss/crossentropy": 2.677482843399048, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.19182386249303818, "step": 6977 }, { "epoch": 0.10419668655134054, "grad_norm": 0.330078125, "grad_norm_var": 0.0008402347564697265, "learning_rate": 0.0001, "loss": 1.5952, "loss/crossentropy": 2.665740966796875, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.23584935069084167, "step": 6978 }, { "epoch": 0.10421161872195552, "grad_norm": 0.375, "grad_norm_var": 0.0007764021555582682, "learning_rate": 0.0001, "loss": 1.6134, "loss/crossentropy": 2.505843997001648, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.24226944148540497, "step": 6979 }, { "epoch": 0.1042265508925705, "grad_norm": 0.34375, "grad_norm_var": 0.0007593154907226563, "learning_rate": 0.0001, "loss": 1.4294, "loss/crossentropy": 2.566055178642273, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1872469261288643, "step": 6980 }, { "epoch": 0.10424148306318548, "grad_norm": 0.3671875, "grad_norm_var": 0.0007669448852539063, "learning_rate": 0.0001, "loss": 1.3522, "loss/crossentropy": 2.7340604066848755, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.15684914588928223, "step": 6981 }, { "epoch": 0.10425641523380046, "grad_norm": 0.318359375, "grad_norm_var": 0.000800323486328125, "learning_rate": 0.0001, "loss": 1.4583, "loss/crossentropy": 2.708130359649658, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.1926732212305069, "step": 6982 }, { "epoch": 0.10427134740441545, "grad_norm": 0.31640625, "grad_norm_var": 0.0008457819620768229, "learning_rate": 0.0001, "loss": 1.4481, "loss/crossentropy": 2.4201489686965942, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.20595894008874893, "step": 6983 }, { "epoch": 0.10428627957503042, "grad_norm": 0.318359375, "grad_norm_var": 0.0007689793904622396, "learning_rate": 0.0001, "loss": 1.3416, "loss/crossentropy": 2.5595587491989136, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.16193297505378723, "step": 6984 }, { "epoch": 0.10430121174564541, "grad_norm": 0.341796875, "grad_norm_var": 0.00040022532145182293, "learning_rate": 0.0001, "loss": 1.518, "loss/crossentropy": 2.602683186531067, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20554794371128082, "step": 6985 }, { "epoch": 0.10431614391626039, "grad_norm": 0.361328125, "grad_norm_var": 0.0004149754842122396, "learning_rate": 0.0001, "loss": 1.479, "loss/crossentropy": 2.487916588783264, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.18998367339372635, "step": 6986 }, { "epoch": 0.10433107608687536, "grad_norm": 0.341796875, "grad_norm_var": 0.0003750960032145182, "learning_rate": 0.0001, "loss": 1.4603, "loss/crossentropy": 2.757380962371826, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.2063712701201439, "step": 6987 }, { "epoch": 0.10434600825749035, "grad_norm": 0.36328125, "grad_norm_var": 0.00040001869201660155, "learning_rate": 0.0001, "loss": 1.5819, "loss/crossentropy": 2.6815072298049927, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.21469109505414963, "step": 6988 }, { "epoch": 0.10436094042810533, "grad_norm": 0.3203125, "grad_norm_var": 0.0003829320271809896, "learning_rate": 0.0001, "loss": 1.4005, "loss/crossentropy": 2.6078816652297974, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17392025142908096, "step": 6989 }, { "epoch": 0.10437587259872032, "grad_norm": 0.390625, "grad_norm_var": 0.0005081017812093099, "learning_rate": 0.0001, "loss": 1.5821, "loss/crossentropy": 2.6608835458755493, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.21878641843795776, "step": 6990 }, { "epoch": 0.1043908047693353, "grad_norm": 0.326171875, "grad_norm_var": 0.0005126953125, "learning_rate": 0.0001, "loss": 1.4114, "loss/crossentropy": 2.5764119625091553, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.17704397439956665, "step": 6991 }, { "epoch": 0.10440573693995027, "grad_norm": 0.3359375, "grad_norm_var": 0.0005151748657226563, "learning_rate": 0.0001, "loss": 1.3886, "loss/crossentropy": 2.9314284324645996, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.18161837756633759, "step": 6992 }, { "epoch": 0.10442066911056526, "grad_norm": 0.431640625, "grad_norm_var": 0.0009912490844726563, "learning_rate": 0.0001, "loss": 1.6146, "loss/crossentropy": 2.357695460319519, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.21224994957447052, "step": 6993 }, { "epoch": 0.10443560128118023, "grad_norm": 0.337890625, "grad_norm_var": 0.0009754816691080729, "learning_rate": 0.0001, "loss": 1.4579, "loss/crossentropy": 2.6302268505096436, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.20402870327234268, "step": 6994 }, { "epoch": 0.10445053345179522, "grad_norm": 0.474609375, "grad_norm_var": 0.0019360701243082681, "learning_rate": 0.0001, "loss": 1.6073, "loss/crossentropy": 2.548823595046997, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.22062202543020248, "step": 6995 }, { "epoch": 0.1044654656224102, "grad_norm": 0.396484375, "grad_norm_var": 0.0020266215006510416, "learning_rate": 0.0001, "loss": 1.6838, "loss/crossentropy": 2.5975513458251953, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.25803516060113907, "step": 6996 }, { "epoch": 0.10448039779302519, "grad_norm": 0.369140625, "grad_norm_var": 0.0020290215810139975, "learning_rate": 0.0001, "loss": 1.5532, "loss/crossentropy": 2.485698103904724, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.2172814905643463, "step": 6997 }, { "epoch": 0.10449532996364017, "grad_norm": 0.369140625, "grad_norm_var": 0.00191496213277181, "learning_rate": 0.0001, "loss": 1.5886, "loss/crossentropy": 2.6187626123428345, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.23311349749565125, "step": 6998 }, { "epoch": 0.10451026213425514, "grad_norm": 0.34765625, "grad_norm_var": 0.00178526242574056, "learning_rate": 0.0001, "loss": 1.3705, "loss/crossentropy": 2.7867661714553833, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.16737882792949677, "step": 6999 }, { "epoch": 0.10452519430487013, "grad_norm": 0.328125, "grad_norm_var": 0.0017316182454427083, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.6665759086608887, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.1883121356368065, "step": 7000 }, { "epoch": 0.1045401264754851, "grad_norm": 0.353515625, "grad_norm_var": 0.0017043431599934897, "learning_rate": 0.0001, "loss": 1.4799, "loss/crossentropy": 2.367318630218506, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.20642312616109848, "step": 7001 }, { "epoch": 0.1045550586461001, "grad_norm": 0.34765625, "grad_norm_var": 0.0017235914866129557, "learning_rate": 0.0001, "loss": 1.4788, "loss/crossentropy": 2.543729305267334, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.20143911987543106, "step": 7002 }, { "epoch": 0.10456999081671507, "grad_norm": 0.33203125, "grad_norm_var": 0.0017592748006184896, "learning_rate": 0.0001, "loss": 1.4421, "loss/crossentropy": 2.5166640281677246, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18816711753606796, "step": 7003 }, { "epoch": 0.10458492298733005, "grad_norm": 0.353515625, "grad_norm_var": 0.001766188939412435, "learning_rate": 0.0001, "loss": 1.4663, "loss/crossentropy": 2.844022512435913, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.1890028938651085, "step": 7004 }, { "epoch": 0.10459985515794504, "grad_norm": 0.44921875, "grad_norm_var": 0.0020641167958577473, "learning_rate": 0.0001, "loss": 1.4308, "loss/crossentropy": 2.5402482748031616, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.19252315163612366, "step": 7005 }, { "epoch": 0.10461478732856001, "grad_norm": 0.46484375, "grad_norm_var": 0.0025980472564697266, "learning_rate": 0.0001, "loss": 1.6947, "loss/crossentropy": 2.680159568786621, "loss/fcd": 1.46875, "loss/idx": 10.0, "loss/logits": 0.22598987817764282, "step": 7006 }, { "epoch": 0.104629719499175, "grad_norm": 0.3203125, "grad_norm_var": 0.0026391983032226563, "learning_rate": 0.0001, "loss": 1.3969, "loss/crossentropy": 2.613840103149414, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.18596193939447403, "step": 7007 }, { "epoch": 0.10464465166978998, "grad_norm": 0.31640625, "grad_norm_var": 0.0027666727701822916, "learning_rate": 0.0001, "loss": 1.382, "loss/crossentropy": 2.3953038454055786, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.17501837760210037, "step": 7008 }, { "epoch": 0.10465958384040495, "grad_norm": 0.380859375, "grad_norm_var": 0.002541033426920573, "learning_rate": 0.0001, "loss": 1.5532, "loss/crossentropy": 2.6187100410461426, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.22898443788290024, "step": 7009 }, { "epoch": 0.10467451601101994, "grad_norm": 0.34375, "grad_norm_var": 0.0025170485178629557, "learning_rate": 0.0001, "loss": 1.438, "loss/crossentropy": 2.8170559406280518, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18411055207252502, "step": 7010 }, { "epoch": 0.10468944818163492, "grad_norm": 0.404296875, "grad_norm_var": 0.0018613020579020183, "learning_rate": 0.0001, "loss": 1.4054, "loss/crossentropy": 2.6871098279953003, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.17496836185455322, "step": 7011 }, { "epoch": 0.10470438035224991, "grad_norm": 0.369140625, "grad_norm_var": 0.0018016656239827475, "learning_rate": 0.0001, "loss": 1.6221, "loss/crossentropy": 2.3918163776397705, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.23144687712192535, "step": 7012 }, { "epoch": 0.10471931252286489, "grad_norm": 0.3828125, "grad_norm_var": 0.0018198013305664063, "learning_rate": 0.0001, "loss": 1.5097, "loss/crossentropy": 2.528801918029785, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.19333048909902573, "step": 7013 }, { "epoch": 0.10473424469347986, "grad_norm": 0.50390625, "grad_norm_var": 0.0030031681060791017, "learning_rate": 0.0001, "loss": 1.459, "loss/crossentropy": 2.680696129798889, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.17778225988149643, "step": 7014 }, { "epoch": 0.10474917686409485, "grad_norm": 0.369140625, "grad_norm_var": 0.0029540379842122396, "learning_rate": 0.0001, "loss": 1.5928, "loss/crossentropy": 2.837080955505371, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.23345381766557693, "step": 7015 }, { "epoch": 0.10476410903470983, "grad_norm": 0.419921875, "grad_norm_var": 0.0028920332590738934, "learning_rate": 0.0001, "loss": 1.5907, "loss/crossentropy": 2.5504093170166016, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.2391113042831421, "step": 7016 }, { "epoch": 0.10477904120532482, "grad_norm": 0.3984375, "grad_norm_var": 0.0028477986653645832, "learning_rate": 0.0001, "loss": 1.4704, "loss/crossentropy": 2.614853858947754, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.18523911386728287, "step": 7017 }, { "epoch": 0.10479397337593979, "grad_norm": 0.326171875, "grad_norm_var": 0.002982950210571289, "learning_rate": 0.0001, "loss": 1.3963, "loss/crossentropy": 2.733723521232605, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.18928222358226776, "step": 7018 }, { "epoch": 0.10480890554655478, "grad_norm": 0.333984375, "grad_norm_var": 0.002969805399576823, "learning_rate": 0.0001, "loss": 1.5287, "loss/crossentropy": 2.682516098022461, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.20059484243392944, "step": 7019 }, { "epoch": 0.10482383771716976, "grad_norm": 0.3359375, "grad_norm_var": 0.003059498469034831, "learning_rate": 0.0001, "loss": 1.3999, "loss/crossentropy": 2.6184672117233276, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.16946674138307571, "step": 7020 }, { "epoch": 0.10483876988778473, "grad_norm": 0.345703125, "grad_norm_var": 0.0028076171875, "learning_rate": 0.0001, "loss": 1.6724, "loss/crossentropy": 2.2025439739227295, "loss/fcd": 1.4296875, "loss/idx": 10.0, "loss/logits": 0.2427491918206215, "step": 7021 }, { "epoch": 0.10485370205839972, "grad_norm": 0.349609375, "grad_norm_var": 0.002272144953409831, "learning_rate": 0.0001, "loss": 1.6481, "loss/crossentropy": 2.518189311027527, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.25360170751810074, "step": 7022 }, { "epoch": 0.1048686342290147, "grad_norm": 0.3828125, "grad_norm_var": 0.0021124362945556642, "learning_rate": 0.0001, "loss": 1.4193, "loss/crossentropy": 2.7582250833511353, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18495525419712067, "step": 7023 }, { "epoch": 0.10488356639962969, "grad_norm": 0.3515625, "grad_norm_var": 0.0019258975982666016, "learning_rate": 0.0001, "loss": 1.4438, "loss/crossentropy": 2.789952516555786, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.18210308998823166, "step": 7024 }, { "epoch": 0.10489849857024466, "grad_norm": 0.365234375, "grad_norm_var": 0.00192869504292806, "learning_rate": 0.0001, "loss": 1.4269, "loss/crossentropy": 2.737009644508362, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.192487470805645, "step": 7025 }, { "epoch": 0.10491343074085964, "grad_norm": 0.375, "grad_norm_var": 0.0018640995025634766, "learning_rate": 0.0001, "loss": 1.393, "loss/crossentropy": 2.4625452756881714, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.16641991585493088, "step": 7026 }, { "epoch": 0.10492836291147463, "grad_norm": 0.330078125, "grad_norm_var": 0.0019269148508707682, "learning_rate": 0.0001, "loss": 1.4841, "loss/crossentropy": 2.5568584203720093, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.20283059775829315, "step": 7027 }, { "epoch": 0.1049432950820896, "grad_norm": 0.3515625, "grad_norm_var": 0.0019510904947916666, "learning_rate": 0.0001, "loss": 1.4746, "loss/crossentropy": 2.5275198221206665, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.18555714190006256, "step": 7028 }, { "epoch": 0.1049582272527046, "grad_norm": 0.365234375, "grad_norm_var": 0.0019406477610270183, "learning_rate": 0.0001, "loss": 1.5326, "loss/crossentropy": 2.738517165184021, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.20444491505622864, "step": 7029 }, { "epoch": 0.10497315942331957, "grad_norm": 0.423828125, "grad_norm_var": 0.0009012222290039062, "learning_rate": 0.0001, "loss": 1.6183, "loss/crossentropy": 2.696116805076599, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.2198537215590477, "step": 7030 }, { "epoch": 0.10498809159393455, "grad_norm": 0.345703125, "grad_norm_var": 0.0009195327758789063, "learning_rate": 0.0001, "loss": 1.3837, "loss/crossentropy": 2.663272976875305, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.18450850248336792, "step": 7031 }, { "epoch": 0.10500302376454954, "grad_norm": 0.4453125, "grad_norm_var": 0.0011540571848551431, "learning_rate": 0.0001, "loss": 1.8319, "loss/crossentropy": 2.632164478302002, "loss/fcd": 1.5234375, "loss/idx": 10.0, "loss/logits": 0.30849873274564743, "step": 7032 }, { "epoch": 0.10501795593516451, "grad_norm": 0.322265625, "grad_norm_var": 0.0011683146158854166, "learning_rate": 0.0001, "loss": 1.4701, "loss/crossentropy": 2.363517105579376, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19663581252098083, "step": 7033 }, { "epoch": 0.1050328881057795, "grad_norm": 0.365234375, "grad_norm_var": 0.0010907491048177083, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.640725612640381, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.18582841753959656, "step": 7034 }, { "epoch": 0.10504782027639448, "grad_norm": 0.3515625, "grad_norm_var": 0.0010448296864827473, "learning_rate": 0.0001, "loss": 1.5297, "loss/crossentropy": 2.6453020572662354, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.21332895755767822, "step": 7035 }, { "epoch": 0.10506275244700945, "grad_norm": 0.361328125, "grad_norm_var": 0.0009937922159830729, "learning_rate": 0.0001, "loss": 1.6665, "loss/crossentropy": 2.3964754343032837, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.24464985728263855, "step": 7036 }, { "epoch": 0.10507768461762444, "grad_norm": 0.326171875, "grad_norm_var": 0.00106658935546875, "learning_rate": 0.0001, "loss": 1.4855, "loss/crossentropy": 2.4717276096343994, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.18864520639181137, "step": 7037 }, { "epoch": 0.10509261678823942, "grad_norm": 0.40625, "grad_norm_var": 0.0011638482411702475, "learning_rate": 0.0001, "loss": 1.7419, "loss/crossentropy": 2.1796916127204895, "loss/fcd": 1.484375, "loss/idx": 10.0, "loss/logits": 0.2575122117996216, "step": 7038 }, { "epoch": 0.10510754895885441, "grad_norm": 0.3828125, "grad_norm_var": 0.0011638482411702475, "learning_rate": 0.0001, "loss": 1.4733, "loss/crossentropy": 2.714401960372925, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19597356021404266, "step": 7039 }, { "epoch": 0.10512248112946938, "grad_norm": 0.3359375, "grad_norm_var": 0.0012108961741129558, "learning_rate": 0.0001, "loss": 1.3394, "loss/crossentropy": 2.6389299631118774, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.15193922072649002, "step": 7040 }, { "epoch": 0.10513741330008437, "grad_norm": 0.4921875, "grad_norm_var": 0.0022078831990559895, "learning_rate": 0.0001, "loss": 1.7599, "loss/crossentropy": 2.5327422618865967, "loss/fcd": 1.53515625, "loss/idx": 10.0, "loss/logits": 0.22473035007715225, "step": 7041 }, { "epoch": 0.10515234547069935, "grad_norm": 0.435546875, "grad_norm_var": 0.0024468580881754558, "learning_rate": 0.0001, "loss": 1.5442, "loss/crossentropy": 2.7003519535064697, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20826581120491028, "step": 7042 }, { "epoch": 0.10516727764131432, "grad_norm": 0.3125, "grad_norm_var": 0.0025774637858072915, "learning_rate": 0.0001, "loss": 1.3772, "loss/crossentropy": 2.8316789865493774, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.1819123476743698, "step": 7043 }, { "epoch": 0.10518220981192931, "grad_norm": 0.345703125, "grad_norm_var": 0.002599064509073893, "learning_rate": 0.0001, "loss": 1.3341, "loss/crossentropy": 2.482966899871826, "loss/fcd": 1.171875, "loss/idx": 10.0, "loss/logits": 0.16220900416374207, "step": 7044 }, { "epoch": 0.10519714198254429, "grad_norm": 0.400390625, "grad_norm_var": 0.0026253859202067056, "learning_rate": 0.0001, "loss": 1.4695, "loss/crossentropy": 2.5898576974868774, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.18826167285442352, "step": 7045 }, { "epoch": 0.10521207415315928, "grad_norm": 0.302734375, "grad_norm_var": 0.0028067111968994142, "learning_rate": 0.0001, "loss": 1.4905, "loss/crossentropy": 2.5202428102493286, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.20142042636871338, "step": 7046 }, { "epoch": 0.10522700632377426, "grad_norm": 0.33203125, "grad_norm_var": 0.0028640111287434895, "learning_rate": 0.0001, "loss": 1.4417, "loss/crossentropy": 2.4456862211227417, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.19173671305179596, "step": 7047 }, { "epoch": 0.10524193849438923, "grad_norm": 0.369140625, "grad_norm_var": 0.0024604638417561847, "learning_rate": 0.0001, "loss": 1.463, "loss/crossentropy": 2.597747564315796, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.20132455229759216, "step": 7048 }, { "epoch": 0.10525687066500422, "grad_norm": 0.314453125, "grad_norm_var": 0.0025089104970296224, "learning_rate": 0.0001, "loss": 1.5554, "loss/crossentropy": 2.4873874187469482, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.20385027676820755, "step": 7049 }, { "epoch": 0.1052718028356192, "grad_norm": 0.359375, "grad_norm_var": 0.0025105794270833333, "learning_rate": 0.0001, "loss": 1.4899, "loss/crossentropy": 2.74620521068573, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.204787015914917, "step": 7050 }, { "epoch": 0.10528673500623419, "grad_norm": 0.353515625, "grad_norm_var": 0.002507511774698893, "learning_rate": 0.0001, "loss": 1.6112, "loss/crossentropy": 2.5524799823760986, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.21663406491279602, "step": 7051 }, { "epoch": 0.10530166717684916, "grad_norm": 0.291015625, "grad_norm_var": 0.0028451124827067057, "learning_rate": 0.0001, "loss": 1.4091, "loss/crossentropy": 2.3097819089889526, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.17862070351839066, "step": 7052 }, { "epoch": 0.10531659934746414, "grad_norm": 0.32421875, "grad_norm_var": 0.002854156494140625, "learning_rate": 0.0001, "loss": 1.4365, "loss/crossentropy": 2.618919610977173, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.19435463845729828, "step": 7053 }, { "epoch": 0.10533153151807913, "grad_norm": 0.328125, "grad_norm_var": 0.0027524312337239582, "learning_rate": 0.0001, "loss": 1.5065, "loss/crossentropy": 2.5434683561325073, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.2017739787697792, "step": 7054 }, { "epoch": 0.1053464636886941, "grad_norm": 0.32421875, "grad_norm_var": 0.0027495702107747395, "learning_rate": 0.0001, "loss": 1.3952, "loss/crossentropy": 2.8793909549713135, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.1803899109363556, "step": 7055 }, { "epoch": 0.10536139585930909, "grad_norm": 0.39453125, "grad_norm_var": 0.0028439839680989582, "learning_rate": 0.0001, "loss": 1.53, "loss/crossentropy": 2.577591300010681, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.1940908059477806, "step": 7056 }, { "epoch": 0.10537632802992407, "grad_norm": 0.326171875, "grad_norm_var": 0.0015294233957926433, "learning_rate": 0.0001, "loss": 1.411, "loss/crossentropy": 2.759833812713623, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.18832293152809143, "step": 7057 }, { "epoch": 0.10539126020053906, "grad_norm": 0.353515625, "grad_norm_var": 0.0009553114573160808, "learning_rate": 0.0001, "loss": 1.3649, "loss/crossentropy": 2.6540273427963257, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.16957658529281616, "step": 7058 }, { "epoch": 0.10540619237115403, "grad_norm": 0.38671875, "grad_norm_var": 0.0010326226552327474, "learning_rate": 0.0001, "loss": 1.4658, "loss/crossentropy": 2.7929316759109497, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.1962825506925583, "step": 7059 }, { "epoch": 0.10542112454176901, "grad_norm": 0.357421875, "grad_norm_var": 0.00104368527730306, "learning_rate": 0.0001, "loss": 1.4028, "loss/crossentropy": 2.7010531425476074, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.1684214398264885, "step": 7060 }, { "epoch": 0.105436056712384, "grad_norm": 0.375, "grad_norm_var": 0.0008959452311197917, "learning_rate": 0.0001, "loss": 1.4521, "loss/crossentropy": 2.6653565168380737, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.18256491422653198, "step": 7061 }, { "epoch": 0.10545098888299897, "grad_norm": 0.328125, "grad_norm_var": 0.0007990360260009766, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.722267746925354, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.21612193435430527, "step": 7062 }, { "epoch": 0.10546592105361396, "grad_norm": 0.3359375, "grad_norm_var": 0.0007933139801025391, "learning_rate": 0.0001, "loss": 1.4051, "loss/crossentropy": 2.6398322582244873, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.1824241578578949, "step": 7063 }, { "epoch": 0.10548085322422894, "grad_norm": 0.314453125, "grad_norm_var": 0.0008048852284749349, "learning_rate": 0.0001, "loss": 1.4099, "loss/crossentropy": 2.50749933719635, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.19111397862434387, "step": 7064 }, { "epoch": 0.10549578539484392, "grad_norm": 0.3828125, "grad_norm_var": 0.000848833719889323, "learning_rate": 0.0001, "loss": 1.6464, "loss/crossentropy": 2.3676564693450928, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.2206367552280426, "step": 7065 }, { "epoch": 0.1055107175654589, "grad_norm": 0.4140625, "grad_norm_var": 0.0011336644490559896, "learning_rate": 0.0001, "loss": 1.506, "loss/crossentropy": 2.673351764678955, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.2013401985168457, "step": 7066 }, { "epoch": 0.10552564973607388, "grad_norm": 0.359375, "grad_norm_var": 0.001139052708943685, "learning_rate": 0.0001, "loss": 1.4014, "loss/crossentropy": 2.8495092391967773, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17487242817878723, "step": 7067 }, { "epoch": 0.10554058190668887, "grad_norm": 0.462890625, "grad_norm_var": 0.0016397953033447266, "learning_rate": 0.0001, "loss": 1.9127, "loss/crossentropy": 2.563942790031433, "loss/fcd": 1.640625, "loss/idx": 10.0, "loss/logits": 0.27203798294067383, "step": 7068 }, { "epoch": 0.10555551407730385, "grad_norm": 0.357421875, "grad_norm_var": 0.0015481948852539063, "learning_rate": 0.0001, "loss": 1.5266, "loss/crossentropy": 2.2223654985427856, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.2101968228816986, "step": 7069 }, { "epoch": 0.10557044624791882, "grad_norm": 0.3671875, "grad_norm_var": 0.0014642715454101563, "learning_rate": 0.0001, "loss": 1.5691, "loss/crossentropy": 2.545715093612671, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.22531965374946594, "step": 7070 }, { "epoch": 0.10558537841853381, "grad_norm": 0.35546875, "grad_norm_var": 0.001355425516764323, "learning_rate": 0.0001, "loss": 1.5411, "loss/crossentropy": 2.632166862487793, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.21691645681858063, "step": 7071 }, { "epoch": 0.10560031058914879, "grad_norm": 0.373046875, "grad_norm_var": 0.0013052463531494141, "learning_rate": 0.0001, "loss": 1.5319, "loss/crossentropy": 2.462417244911194, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.21159081906080246, "step": 7072 }, { "epoch": 0.10561524275976378, "grad_norm": 0.32421875, "grad_norm_var": 0.001315752665201823, "learning_rate": 0.0001, "loss": 1.5028, "loss/crossentropy": 2.467747688293457, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.19423165917396545, "step": 7073 }, { "epoch": 0.10563017493037875, "grad_norm": 0.3203125, "grad_norm_var": 0.0014376163482666016, "learning_rate": 0.0001, "loss": 1.3227, "loss/crossentropy": 2.539307475090027, "loss/fcd": 1.15625, "loss/idx": 10.0, "loss/logits": 0.16647501289844513, "step": 7074 }, { "epoch": 0.10564510710099373, "grad_norm": 0.369140625, "grad_norm_var": 0.00140228271484375, "learning_rate": 0.0001, "loss": 1.4574, "loss/crossentropy": 2.506648898124695, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19172754138708115, "step": 7075 }, { "epoch": 0.10566003927160872, "grad_norm": 0.40234375, "grad_norm_var": 0.001499160130818685, "learning_rate": 0.0001, "loss": 1.4946, "loss/crossentropy": 2.5156253576278687, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.2055065855383873, "step": 7076 }, { "epoch": 0.1056749714422237, "grad_norm": 0.33984375, "grad_norm_var": 0.0015300591786702475, "learning_rate": 0.0001, "loss": 1.5058, "loss/crossentropy": 2.6137722730636597, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.21286041289567947, "step": 7077 }, { "epoch": 0.10568990361283868, "grad_norm": 0.423828125, "grad_norm_var": 0.0016585667928059897, "learning_rate": 0.0001, "loss": 1.7055, "loss/crossentropy": 2.6140938997268677, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.24842800945043564, "step": 7078 }, { "epoch": 0.10570483578345366, "grad_norm": 0.388671875, "grad_norm_var": 0.00160063107808431, "learning_rate": 0.0001, "loss": 1.5711, "loss/crossentropy": 2.599150776863098, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.2156325876712799, "step": 7079 }, { "epoch": 0.10571976795406865, "grad_norm": 0.341796875, "grad_norm_var": 0.0014368534088134766, "learning_rate": 0.0001, "loss": 1.6634, "loss/crossentropy": 2.8056392669677734, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.28061386942863464, "step": 7080 }, { "epoch": 0.10573470012468363, "grad_norm": 0.337890625, "grad_norm_var": 0.0015096028645833334, "learning_rate": 0.0001, "loss": 1.5486, "loss/crossentropy": 2.425775647163391, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.2282400280237198, "step": 7081 }, { "epoch": 0.1057496322952986, "grad_norm": 0.34765625, "grad_norm_var": 0.0014047622680664062, "learning_rate": 0.0001, "loss": 1.3843, "loss/crossentropy": 2.6024203300476074, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.1811571940779686, "step": 7082 }, { "epoch": 0.10576456446591359, "grad_norm": 0.365234375, "grad_norm_var": 0.0014009952545166015, "learning_rate": 0.0001, "loss": 1.5185, "loss/crossentropy": 2.365049362182617, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.20208748430013657, "step": 7083 }, { "epoch": 0.10577949663652857, "grad_norm": 0.3828125, "grad_norm_var": 0.00078125, "learning_rate": 0.0001, "loss": 1.4329, "loss/crossentropy": 2.5436431169509888, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.19076219201087952, "step": 7084 }, { "epoch": 0.10579442880714356, "grad_norm": 0.390625, "grad_norm_var": 0.0008285363515218099, "learning_rate": 0.0001, "loss": 1.6344, "loss/crossentropy": 2.666842222213745, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.23201584070920944, "step": 7085 }, { "epoch": 0.10580936097775853, "grad_norm": 0.330078125, "grad_norm_var": 0.0009007136027018229, "learning_rate": 0.0001, "loss": 1.4713, "loss/crossentropy": 2.478953003883362, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.1783730313181877, "step": 7086 }, { "epoch": 0.10582429314837351, "grad_norm": 0.357421875, "grad_norm_var": 0.0008992354075113932, "learning_rate": 0.0001, "loss": 1.5163, "loss/crossentropy": 2.325719714164734, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.19595985114574432, "step": 7087 }, { "epoch": 0.1058392253189885, "grad_norm": 0.3046875, "grad_norm_var": 0.0010922749837239583, "learning_rate": 0.0001, "loss": 1.3654, "loss/crossentropy": 2.559959292411804, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.17009373009204865, "step": 7088 }, { "epoch": 0.10585415748960347, "grad_norm": 0.35546875, "grad_norm_var": 0.0010129292805989584, "learning_rate": 0.0001, "loss": 1.448, "loss/crossentropy": 2.4426406621932983, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.1823362559080124, "step": 7089 }, { "epoch": 0.10586908966021846, "grad_norm": 0.359375, "grad_norm_var": 0.0009023030598958333, "learning_rate": 0.0001, "loss": 1.6556, "loss/crossentropy": 2.5378520488739014, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.24544782936573029, "step": 7090 }, { "epoch": 0.10588402183083344, "grad_norm": 0.361328125, "grad_norm_var": 0.0008989969889322916, "learning_rate": 0.0001, "loss": 1.4946, "loss/crossentropy": 2.7549543380737305, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.20557404309511185, "step": 7091 }, { "epoch": 0.10589895400144841, "grad_norm": 0.375, "grad_norm_var": 0.0007979710896809896, "learning_rate": 0.0001, "loss": 1.4064, "loss/crossentropy": 2.583276152610779, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17983217537403107, "step": 7092 }, { "epoch": 0.1059138861720634, "grad_norm": 0.46484375, "grad_norm_var": 0.0014368057250976562, "learning_rate": 0.0001, "loss": 1.7541, "loss/crossentropy": 2.7162290811538696, "loss/fcd": 1.49609375, "loss/idx": 10.0, "loss/logits": 0.25797466933727264, "step": 7093 }, { "epoch": 0.10592881834267838, "grad_norm": 0.3203125, "grad_norm_var": 0.0013348738352457683, "learning_rate": 0.0001, "loss": 1.4191, "loss/crossentropy": 2.474065661430359, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18084512650966644, "step": 7094 }, { "epoch": 0.10594375051329337, "grad_norm": 0.3828125, "grad_norm_var": 0.001315752665201823, "learning_rate": 0.0001, "loss": 1.5754, "loss/crossentropy": 2.646570563316345, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.20818112790584564, "step": 7095 }, { "epoch": 0.10595868268390835, "grad_norm": 0.353515625, "grad_norm_var": 0.0012941996256510416, "learning_rate": 0.0001, "loss": 1.4691, "loss/crossentropy": 2.85100781917572, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19175072014331818, "step": 7096 }, { "epoch": 0.10597361485452332, "grad_norm": 0.337890625, "grad_norm_var": 0.0012941996256510416, "learning_rate": 0.0001, "loss": 1.5348, "loss/crossentropy": 2.5452184677124023, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.2183985710144043, "step": 7097 }, { "epoch": 0.10598854702513831, "grad_norm": 0.55078125, "grad_norm_var": 0.0034894307454427084, "learning_rate": 0.0001, "loss": 1.8997, "loss/crossentropy": 2.83378005027771, "loss/fcd": 1.6171875, "loss/idx": 10.0, "loss/logits": 0.28248271346092224, "step": 7098 }, { "epoch": 0.10600347919575329, "grad_norm": 0.3359375, "grad_norm_var": 0.0035793145497639975, "learning_rate": 0.0001, "loss": 1.3847, "loss/crossentropy": 2.448978066444397, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.1776464506983757, "step": 7099 }, { "epoch": 0.10601841136636828, "grad_norm": 0.34765625, "grad_norm_var": 0.0036090691884358723, "learning_rate": 0.0001, "loss": 1.2871, "loss/crossentropy": 2.712105870246887, "loss/fcd": 1.140625, "loss/idx": 10.0, "loss/logits": 0.14642829447984695, "step": 7100 }, { "epoch": 0.10603334353698325, "grad_norm": 0.318359375, "grad_norm_var": 0.0037413914998372395, "learning_rate": 0.0001, "loss": 1.4157, "loss/crossentropy": 2.5182785987854004, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.19307124614715576, "step": 7101 }, { "epoch": 0.10604827570759824, "grad_norm": 0.345703125, "grad_norm_var": 0.0036818822224934894, "learning_rate": 0.0001, "loss": 1.494, "loss/crossentropy": 2.615993857383728, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.2088722214102745, "step": 7102 }, { "epoch": 0.10606320787821322, "grad_norm": 0.474609375, "grad_norm_var": 0.00439141591389974, "learning_rate": 0.0001, "loss": 1.5178, "loss/crossentropy": 2.6689027547836304, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.22481638193130493, "step": 7103 }, { "epoch": 0.10607814004882819, "grad_norm": 0.33984375, "grad_norm_var": 0.0041425069173177086, "learning_rate": 0.0001, "loss": 1.4797, "loss/crossentropy": 2.633814215660095, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.19456040859222412, "step": 7104 }, { "epoch": 0.10609307221944318, "grad_norm": 0.322265625, "grad_norm_var": 0.004304361343383789, "learning_rate": 0.0001, "loss": 1.3535, "loss/crossentropy": 2.555541753768921, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.169914610683918, "step": 7105 }, { "epoch": 0.10610800439005816, "grad_norm": 0.353515625, "grad_norm_var": 0.0043182373046875, "learning_rate": 0.0001, "loss": 1.4829, "loss/crossentropy": 2.653850793838501, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19379503279924393, "step": 7106 }, { "epoch": 0.10612293656067315, "grad_norm": 0.328125, "grad_norm_var": 0.00444334348042806, "learning_rate": 0.0001, "loss": 1.3656, "loss/crossentropy": 2.6248501539230347, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.1663564294576645, "step": 7107 }, { "epoch": 0.10613786873128812, "grad_norm": 0.310546875, "grad_norm_var": 0.00467675526936849, "learning_rate": 0.0001, "loss": 1.4193, "loss/crossentropy": 2.36641263961792, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.17707691341638565, "step": 7108 }, { "epoch": 0.1061528009019031, "grad_norm": 0.3828125, "grad_norm_var": 0.004037221272786458, "learning_rate": 0.0001, "loss": 1.5472, "loss/crossentropy": 2.7088600397109985, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.22299277037382126, "step": 7109 }, { "epoch": 0.10616773307251809, "grad_norm": 0.380859375, "grad_norm_var": 0.003923400243123373, "learning_rate": 0.0001, "loss": 1.562, "loss/crossentropy": 2.6397719383239746, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.2104681134223938, "step": 7110 }, { "epoch": 0.10618266524313306, "grad_norm": 0.349609375, "grad_norm_var": 0.003920427958170573, "learning_rate": 0.0001, "loss": 1.5308, "loss/crossentropy": 2.9960466623306274, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.20265864580869675, "step": 7111 }, { "epoch": 0.10619759741374805, "grad_norm": 0.341796875, "grad_norm_var": 0.003946177164713542, "learning_rate": 0.0001, "loss": 1.5887, "loss/crossentropy": 2.5434523820877075, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.2215305119752884, "step": 7112 }, { "epoch": 0.10621252958436303, "grad_norm": 0.369140625, "grad_norm_var": 0.003899383544921875, "learning_rate": 0.0001, "loss": 1.5134, "loss/crossentropy": 2.714392066001892, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.20871072262525558, "step": 7113 }, { "epoch": 0.106227461754978, "grad_norm": 0.39453125, "grad_norm_var": 0.0015698750813802084, "learning_rate": 0.0001, "loss": 1.7062, "loss/crossentropy": 2.2325726747512817, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.2686538100242615, "step": 7114 }, { "epoch": 0.106242393925593, "grad_norm": 0.36328125, "grad_norm_var": 0.0015436172485351562, "learning_rate": 0.0001, "loss": 1.5019, "loss/crossentropy": 2.6657607555389404, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19719459116458893, "step": 7115 }, { "epoch": 0.10625732609620797, "grad_norm": 0.423828125, "grad_norm_var": 0.0018045902252197266, "learning_rate": 0.0001, "loss": 1.6228, "loss/crossentropy": 2.7876899242401123, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.2478167936205864, "step": 7116 }, { "epoch": 0.10627225826682296, "grad_norm": 0.337890625, "grad_norm_var": 0.0017136732737223308, "learning_rate": 0.0001, "loss": 1.4258, "loss/crossentropy": 2.6687724590301514, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.1914082169532776, "step": 7117 }, { "epoch": 0.10628719043743794, "grad_norm": 0.375, "grad_norm_var": 0.001697222391764323, "learning_rate": 0.0001, "loss": 1.331, "loss/crossentropy": 2.561816692352295, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.1552055925130844, "step": 7118 }, { "epoch": 0.10630212260805293, "grad_norm": 0.34765625, "grad_norm_var": 0.0008572737375895182, "learning_rate": 0.0001, "loss": 1.7635, "loss/crossentropy": 2.4587820768356323, "loss/fcd": 1.51171875, "loss/idx": 10.0, "loss/logits": 0.2518172264099121, "step": 7119 }, { "epoch": 0.1063170547786679, "grad_norm": 0.32421875, "grad_norm_var": 0.0009094079335530599, "learning_rate": 0.0001, "loss": 1.4343, "loss/crossentropy": 2.6655439138412476, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.19599266350269318, "step": 7120 }, { "epoch": 0.10633198694928288, "grad_norm": 0.30859375, "grad_norm_var": 0.0009836196899414063, "learning_rate": 0.0001, "loss": 1.3771, "loss/crossentropy": 2.4793730974197388, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.17393121123313904, "step": 7121 }, { "epoch": 0.10634691911989787, "grad_norm": 0.30078125, "grad_norm_var": 0.0011728763580322265, "learning_rate": 0.0001, "loss": 1.4056, "loss/crossentropy": 2.4977093935012817, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.18683480471372604, "step": 7122 }, { "epoch": 0.10636185129051284, "grad_norm": 0.37109375, "grad_norm_var": 0.0011490980784098308, "learning_rate": 0.0001, "loss": 1.5907, "loss/crossentropy": 2.63076114654541, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.2352019026875496, "step": 7123 }, { "epoch": 0.10637678346112783, "grad_norm": 0.337890625, "grad_norm_var": 0.0010333855946858723, "learning_rate": 0.0001, "loss": 1.4719, "loss/crossentropy": 2.456574320793152, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1906299740076065, "step": 7124 }, { "epoch": 0.10639171563174281, "grad_norm": 0.345703125, "grad_norm_var": 0.0009908040364583334, "learning_rate": 0.0001, "loss": 1.5159, "loss/crossentropy": 2.5330499410629272, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.19947615265846252, "step": 7125 }, { "epoch": 0.10640664780235778, "grad_norm": 0.345703125, "grad_norm_var": 0.0009444554646809896, "learning_rate": 0.0001, "loss": 1.4902, "loss/crossentropy": 2.6491293907165527, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.20899266004562378, "step": 7126 }, { "epoch": 0.10642157997297277, "grad_norm": 0.330078125, "grad_norm_var": 0.0009752909342447916, "learning_rate": 0.0001, "loss": 1.5026, "loss/crossentropy": 2.6083394289016724, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.2017742171883583, "step": 7127 }, { "epoch": 0.10643651214358775, "grad_norm": 0.5078125, "grad_norm_var": 0.002492507298787435, "learning_rate": 0.0001, "loss": 2.0163, "loss/crossentropy": 2.495531678199768, "loss/fcd": 1.6875, "loss/idx": 10.0, "loss/logits": 0.3287610858678818, "step": 7128 }, { "epoch": 0.10645144431420274, "grad_norm": 0.3515625, "grad_norm_var": 0.0024937947591145834, "learning_rate": 0.0001, "loss": 1.4761, "loss/crossentropy": 2.599992513656616, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1948041319847107, "step": 7129 }, { "epoch": 0.10646637648481772, "grad_norm": 0.35546875, "grad_norm_var": 0.002411142985026042, "learning_rate": 0.0001, "loss": 1.5005, "loss/crossentropy": 2.572971224784851, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.195856511592865, "step": 7130 }, { "epoch": 0.10648130865543269, "grad_norm": 0.296875, "grad_norm_var": 0.0026391983032226563, "learning_rate": 0.0001, "loss": 1.3029, "loss/crossentropy": 2.3974199295043945, "loss/fcd": 1.15234375, "loss/idx": 10.0, "loss/logits": 0.15052543580532074, "step": 7131 }, { "epoch": 0.10649624082604768, "grad_norm": 0.361328125, "grad_norm_var": 0.0022994359334309895, "learning_rate": 0.0001, "loss": 1.568, "loss/crossentropy": 2.622364640235901, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.21641940623521805, "step": 7132 }, { "epoch": 0.10651117299666266, "grad_norm": 0.33203125, "grad_norm_var": 0.002310927708943685, "learning_rate": 0.0001, "loss": 1.4308, "loss/crossentropy": 2.7001984119415283, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1886611059308052, "step": 7133 }, { "epoch": 0.10652610516727765, "grad_norm": 0.427734375, "grad_norm_var": 0.0026641209920247396, "learning_rate": 0.0001, "loss": 1.9984, "loss/crossentropy": 2.6306231021881104, "loss/fcd": 1.703125, "loss/idx": 10.0, "loss/logits": 0.2952718138694763, "step": 7134 }, { "epoch": 0.10654103733789262, "grad_norm": 0.337890625, "grad_norm_var": 0.0026767571767171224, "learning_rate": 0.0001, "loss": 1.4899, "loss/crossentropy": 2.6227632761001587, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.20083357393741608, "step": 7135 }, { "epoch": 0.1065559695085076, "grad_norm": 0.306640625, "grad_norm_var": 0.002761586507161458, "learning_rate": 0.0001, "loss": 1.405, "loss/crossentropy": 2.477224349975586, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17844785004854202, "step": 7136 }, { "epoch": 0.10657090167912259, "grad_norm": 0.369140625, "grad_norm_var": 0.0026477654774983723, "learning_rate": 0.0001, "loss": 1.4423, "loss/crossentropy": 2.864624261856079, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18843212723731995, "step": 7137 }, { "epoch": 0.10658583384973756, "grad_norm": 0.365234375, "grad_norm_var": 0.0024426778157552085, "learning_rate": 0.0001, "loss": 1.4861, "loss/crossentropy": 2.542586922645569, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19700032472610474, "step": 7138 }, { "epoch": 0.10660076602035255, "grad_norm": 0.322265625, "grad_norm_var": 0.002512216567993164, "learning_rate": 0.0001, "loss": 1.3329, "loss/crossentropy": 2.5550469160079956, "loss/fcd": 1.16796875, "loss/idx": 10.0, "loss/logits": 0.16496922075748444, "step": 7139 }, { "epoch": 0.10661569819096753, "grad_norm": 0.341796875, "grad_norm_var": 0.002503824234008789, "learning_rate": 0.0001, "loss": 1.4912, "loss/crossentropy": 2.7044023275375366, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.20608694851398468, "step": 7140 }, { "epoch": 0.10663063036158252, "grad_norm": 0.34765625, "grad_norm_var": 0.002501360575358073, "learning_rate": 0.0001, "loss": 1.6963, "loss/crossentropy": 2.328610062599182, "loss/fcd": 1.47265625, "loss/idx": 10.0, "loss/logits": 0.2236679047346115, "step": 7141 }, { "epoch": 0.1066455625321975, "grad_norm": 0.333984375, "grad_norm_var": 0.0025263468424479166, "learning_rate": 0.0001, "loss": 1.4056, "loss/crossentropy": 2.5680397748947144, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.1829049214720726, "step": 7142 }, { "epoch": 0.10666049470281247, "grad_norm": 0.322265625, "grad_norm_var": 0.002556610107421875, "learning_rate": 0.0001, "loss": 1.4955, "loss/crossentropy": 2.4917304515838623, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.20642514526844025, "step": 7143 }, { "epoch": 0.10667542687342746, "grad_norm": 0.41015625, "grad_norm_var": 0.0011626561482747397, "learning_rate": 0.0001, "loss": 1.6344, "loss/crossentropy": 2.446821093559265, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.23598123341798782, "step": 7144 }, { "epoch": 0.10669035904404243, "grad_norm": 0.326171875, "grad_norm_var": 0.001193857192993164, "learning_rate": 0.0001, "loss": 1.3889, "loss/crossentropy": 2.742687940597534, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.1896917223930359, "step": 7145 }, { "epoch": 0.10670529121465742, "grad_norm": 0.376953125, "grad_norm_var": 0.0012461344401041667, "learning_rate": 0.0001, "loss": 1.5667, "loss/crossentropy": 2.6964752674102783, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.21908524632453918, "step": 7146 }, { "epoch": 0.1067202233852724, "grad_norm": 0.33984375, "grad_norm_var": 0.0010649998982747396, "learning_rate": 0.0001, "loss": 1.4055, "loss/crossentropy": 2.4799952507019043, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17896872013807297, "step": 7147 }, { "epoch": 0.10673515555588738, "grad_norm": 0.3515625, "grad_norm_var": 0.0010579268137613932, "learning_rate": 0.0001, "loss": 1.4904, "loss/crossentropy": 2.774262309074402, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.20137207955121994, "step": 7148 }, { "epoch": 0.10675008772650237, "grad_norm": 0.369140625, "grad_norm_var": 0.0010515848795572917, "learning_rate": 0.0001, "loss": 1.3225, "loss/crossentropy": 2.6411718130111694, "loss/fcd": 1.1640625, "loss/idx": 10.0, "loss/logits": 0.1584150716662407, "step": 7149 }, { "epoch": 0.10676501989711734, "grad_norm": 0.359375, "grad_norm_var": 0.0006627241770426432, "learning_rate": 0.0001, "loss": 1.5348, "loss/crossentropy": 2.7551662921905518, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.2223270684480667, "step": 7150 }, { "epoch": 0.10677995206773233, "grad_norm": 0.330078125, "grad_norm_var": 0.0006778558095296224, "learning_rate": 0.0001, "loss": 1.3682, "loss/crossentropy": 2.6212284564971924, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.17680329084396362, "step": 7151 }, { "epoch": 0.1067948842383473, "grad_norm": 0.337890625, "grad_norm_var": 0.0005654493967692057, "learning_rate": 0.0001, "loss": 1.4247, "loss/crossentropy": 2.4001437425613403, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.17861278355121613, "step": 7152 }, { "epoch": 0.10680981640896228, "grad_norm": 0.361328125, "grad_norm_var": 0.0005495548248291016, "learning_rate": 0.0001, "loss": 1.5274, "loss/crossentropy": 2.735026240348816, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.21878747642040253, "step": 7153 }, { "epoch": 0.10682474857957727, "grad_norm": 0.328125, "grad_norm_var": 0.000558916727701823, "learning_rate": 0.0001, "loss": 1.4584, "loss/crossentropy": 2.5803555250167847, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.19663339108228683, "step": 7154 }, { "epoch": 0.10683968075019225, "grad_norm": 0.34375, "grad_norm_var": 0.0005157311757405599, "learning_rate": 0.0001, "loss": 1.5118, "loss/crossentropy": 2.316120743751526, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.19926510006189346, "step": 7155 }, { "epoch": 0.10685461292080724, "grad_norm": 0.376953125, "grad_norm_var": 0.0005603631337483724, "learning_rate": 0.0001, "loss": 1.6295, "loss/crossentropy": 2.5800037384033203, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.22326132655143738, "step": 7156 }, { "epoch": 0.10686954509142221, "grad_norm": 0.349609375, "grad_norm_var": 0.0005597432454427083, "learning_rate": 0.0001, "loss": 1.5131, "loss/crossentropy": 2.620572805404663, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.1967054083943367, "step": 7157 }, { "epoch": 0.10688447726203719, "grad_norm": 0.3359375, "grad_norm_var": 0.0005555311838785807, "learning_rate": 0.0001, "loss": 1.5606, "loss/crossentropy": 2.6641225814819336, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.22852330654859543, "step": 7158 }, { "epoch": 0.10689940943265218, "grad_norm": 0.455078125, "grad_norm_var": 0.0011456648508707682, "learning_rate": 0.0001, "loss": 1.7029, "loss/crossentropy": 2.4662355184555054, "loss/fcd": 1.46875, "loss/idx": 10.0, "loss/logits": 0.23412150144577026, "step": 7159 }, { "epoch": 0.10691434160326715, "grad_norm": 0.32421875, "grad_norm_var": 0.0010267734527587891, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.582581043243408, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.17258531600236893, "step": 7160 }, { "epoch": 0.10692927377388214, "grad_norm": 0.341796875, "grad_norm_var": 0.0009837945302327473, "learning_rate": 0.0001, "loss": 1.5791, "loss/crossentropy": 2.6646156311035156, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.22751913219690323, "step": 7161 }, { "epoch": 0.10694420594449712, "grad_norm": 0.396484375, "grad_norm_var": 0.0010645389556884766, "learning_rate": 0.0001, "loss": 1.8695, "loss/crossentropy": 2.665061593055725, "loss/fcd": 1.52734375, "loss/idx": 10.0, "loss/logits": 0.34217506647109985, "step": 7162 }, { "epoch": 0.10695913811511211, "grad_norm": 0.359375, "grad_norm_var": 0.0010454654693603516, "learning_rate": 0.0001, "loss": 1.6004, "loss/crossentropy": 2.527356266975403, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.23713934421539307, "step": 7163 }, { "epoch": 0.10697407028572709, "grad_norm": 0.357421875, "grad_norm_var": 0.001042938232421875, "learning_rate": 0.0001, "loss": 1.4539, "loss/crossentropy": 2.3412665128707886, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1961105912923813, "step": 7164 }, { "epoch": 0.10698900245634206, "grad_norm": 0.322265625, "grad_norm_var": 0.001110076904296875, "learning_rate": 0.0001, "loss": 1.4971, "loss/crossentropy": 2.5572177171707153, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.20024646818637848, "step": 7165 }, { "epoch": 0.10700393462695705, "grad_norm": 0.345703125, "grad_norm_var": 0.001113748550415039, "learning_rate": 0.0001, "loss": 1.5393, "loss/crossentropy": 2.6976065635681152, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.21901633590459824, "step": 7166 }, { "epoch": 0.10701886679757203, "grad_norm": 0.76171875, "grad_norm_var": 0.011374346415201823, "learning_rate": 0.0001, "loss": 1.9511, "loss/crossentropy": 2.598344922065735, "loss/fcd": 1.62890625, "loss/idx": 10.0, "loss/logits": 0.32217568159103394, "step": 7167 }, { "epoch": 0.10703379896818702, "grad_norm": 0.421875, "grad_norm_var": 0.011331288019816081, "learning_rate": 0.0001, "loss": 1.4804, "loss/crossentropy": 2.501786708831787, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.18349133431911469, "step": 7168 }, { "epoch": 0.10704873113880199, "grad_norm": 0.306640625, "grad_norm_var": 0.011700677871704101, "learning_rate": 0.0001, "loss": 1.331, "loss/crossentropy": 2.3872262239456177, "loss/fcd": 1.171875, "loss/idx": 10.0, "loss/logits": 0.15910042822360992, "step": 7169 }, { "epoch": 0.10706366330941697, "grad_norm": 0.36328125, "grad_norm_var": 0.011521005630493164, "learning_rate": 0.0001, "loss": 1.3549, "loss/crossentropy": 2.4551128149032593, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.16345347464084625, "step": 7170 }, { "epoch": 0.10707859548003196, "grad_norm": 0.35546875, "grad_norm_var": 0.011464929580688477, "learning_rate": 0.0001, "loss": 1.4372, "loss/crossentropy": 2.594131588935852, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.1871713250875473, "step": 7171 }, { "epoch": 0.10709352765064693, "grad_norm": 0.33984375, "grad_norm_var": 0.01159509023030599, "learning_rate": 0.0001, "loss": 1.4633, "loss/crossentropy": 2.6063965559005737, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19770146161317825, "step": 7172 }, { "epoch": 0.10710845982126192, "grad_norm": 0.34765625, "grad_norm_var": 0.01160416603088379, "learning_rate": 0.0001, "loss": 1.5199, "loss/crossentropy": 2.5969645977020264, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20743772387504578, "step": 7173 }, { "epoch": 0.1071233919918769, "grad_norm": 0.349609375, "grad_norm_var": 0.011529286702473959, "learning_rate": 0.0001, "loss": 1.4642, "loss/crossentropy": 2.852925419807434, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.21415682137012482, "step": 7174 }, { "epoch": 0.10713832416249187, "grad_norm": 0.376953125, "grad_norm_var": 0.011173248291015625, "learning_rate": 0.0001, "loss": 1.4797, "loss/crossentropy": 2.9029042720794678, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.19455412030220032, "step": 7175 }, { "epoch": 0.10715325633310686, "grad_norm": 0.412109375, "grad_norm_var": 0.011009454727172852, "learning_rate": 0.0001, "loss": 1.5127, "loss/crossentropy": 2.69363534450531, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.19633860141038895, "step": 7176 }, { "epoch": 0.10716818850372184, "grad_norm": 0.302734375, "grad_norm_var": 0.011329253514607748, "learning_rate": 0.0001, "loss": 1.3485, "loss/crossentropy": 2.4677112102508545, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.1648927628993988, "step": 7177 }, { "epoch": 0.10718312067433683, "grad_norm": 0.37109375, "grad_norm_var": 0.011322021484375, "learning_rate": 0.0001, "loss": 1.4926, "loss/crossentropy": 2.2375375032424927, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.1918317899107933, "step": 7178 }, { "epoch": 0.1071980528449518, "grad_norm": 0.361328125, "grad_norm_var": 0.011316665013631185, "learning_rate": 0.0001, "loss": 1.6436, "loss/crossentropy": 2.5457693338394165, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.2452111914753914, "step": 7179 }, { "epoch": 0.1072129850155668, "grad_norm": 0.306640625, "grad_norm_var": 0.011637353897094726, "learning_rate": 0.0001, "loss": 1.4749, "loss/crossentropy": 2.594764471054077, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.20925475656986237, "step": 7180 }, { "epoch": 0.10722791718618177, "grad_norm": 0.359375, "grad_norm_var": 0.01144860585530599, "learning_rate": 0.0001, "loss": 1.5482, "loss/crossentropy": 2.6018787622451782, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.22007660567760468, "step": 7181 }, { "epoch": 0.10724284935679675, "grad_norm": 0.328125, "grad_norm_var": 0.011548598607381185, "learning_rate": 0.0001, "loss": 1.4404, "loss/crossentropy": 2.5455026626586914, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.19821010529994965, "step": 7182 }, { "epoch": 0.10725778152741174, "grad_norm": 0.333984375, "grad_norm_var": 0.0011580785115559895, "learning_rate": 0.0001, "loss": 1.3648, "loss/crossentropy": 2.5947195291519165, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.17338912189006805, "step": 7183 }, { "epoch": 0.10727271369802671, "grad_norm": 0.310546875, "grad_norm_var": 0.0008998711903889974, "learning_rate": 0.0001, "loss": 1.3906, "loss/crossentropy": 2.4622703790664673, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.17574024945497513, "step": 7184 }, { "epoch": 0.1072876458686417, "grad_norm": 0.404296875, "grad_norm_var": 0.0009920597076416016, "learning_rate": 0.0001, "loss": 1.5385, "loss/crossentropy": 2.629993200302124, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.19087401032447815, "step": 7185 }, { "epoch": 0.10730257803925668, "grad_norm": 0.349609375, "grad_norm_var": 0.0009821573893229166, "learning_rate": 0.0001, "loss": 1.4983, "loss/crossentropy": 2.6929088830947876, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.20142386853694916, "step": 7186 }, { "epoch": 0.10731751020987165, "grad_norm": 0.310546875, "grad_norm_var": 0.0010790348052978516, "learning_rate": 0.0001, "loss": 1.345, "loss/crossentropy": 2.6334917545318604, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.1652694046497345, "step": 7187 }, { "epoch": 0.10733244238048664, "grad_norm": 0.361328125, "grad_norm_var": 0.0010851542154947916, "learning_rate": 0.0001, "loss": 1.5576, "loss/crossentropy": 2.5751835107803345, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.20999088138341904, "step": 7188 }, { "epoch": 0.10734737455110162, "grad_norm": 0.357421875, "grad_norm_var": 0.0010892073313395181, "learning_rate": 0.0001, "loss": 1.4991, "loss/crossentropy": 2.612894654273987, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.1866043657064438, "step": 7189 }, { "epoch": 0.10736230672171661, "grad_norm": 0.337890625, "grad_norm_var": 0.0010979811350504557, "learning_rate": 0.0001, "loss": 1.3887, "loss/crossentropy": 2.5797594785690308, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.1738997846841812, "step": 7190 }, { "epoch": 0.10737723889233158, "grad_norm": 0.37890625, "grad_norm_var": 0.001105499267578125, "learning_rate": 0.0001, "loss": 1.513, "loss/crossentropy": 2.772130012512207, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.2043759599328041, "step": 7191 }, { "epoch": 0.10739217106294656, "grad_norm": 0.365234375, "grad_norm_var": 0.000849151611328125, "learning_rate": 0.0001, "loss": 1.4683, "loss/crossentropy": 2.703853487968445, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.20272380113601685, "step": 7192 }, { "epoch": 0.10740710323356155, "grad_norm": 0.359375, "grad_norm_var": 0.0007214705149332682, "learning_rate": 0.0001, "loss": 1.5215, "loss/crossentropy": 2.4948956966400146, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.20506145805120468, "step": 7193 }, { "epoch": 0.10742203540417652, "grad_norm": 0.359375, "grad_norm_var": 0.0006966749827067057, "learning_rate": 0.0001, "loss": 1.4078, "loss/crossentropy": 2.55290949344635, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1773621216416359, "step": 7194 }, { "epoch": 0.10743696757479151, "grad_norm": 0.515625, "grad_norm_var": 0.0024382909138997394, "learning_rate": 0.0001, "loss": 1.6509, "loss/crossentropy": 2.3962451219558716, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.23289035260677338, "step": 7195 }, { "epoch": 0.10745189974540649, "grad_norm": 0.33203125, "grad_norm_var": 0.0023025353749593098, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.7109997272491455, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.18136823177337646, "step": 7196 }, { "epoch": 0.10746683191602147, "grad_norm": 0.3125, "grad_norm_var": 0.0024452050526936847, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.393248438835144, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.1765645295381546, "step": 7197 }, { "epoch": 0.10748176408663646, "grad_norm": 0.34765625, "grad_norm_var": 0.002393070856730143, "learning_rate": 0.0001, "loss": 1.536, "loss/crossentropy": 2.446060061454773, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.2000514194369316, "step": 7198 }, { "epoch": 0.10749669625725143, "grad_norm": 0.388671875, "grad_norm_var": 0.0024010817209879558, "learning_rate": 0.0001, "loss": 1.6263, "loss/crossentropy": 2.5800750255584717, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.22009853273630142, "step": 7199 }, { "epoch": 0.10751162842786642, "grad_norm": 0.384765625, "grad_norm_var": 0.002236795425415039, "learning_rate": 0.0001, "loss": 1.5815, "loss/crossentropy": 2.5704315900802612, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.2338242307305336, "step": 7200 }, { "epoch": 0.1075265605984814, "grad_norm": 0.359375, "grad_norm_var": 0.002136993408203125, "learning_rate": 0.0001, "loss": 1.4074, "loss/crossentropy": 2.7633756399154663, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.18082578480243683, "step": 7201 }, { "epoch": 0.10754149276909639, "grad_norm": 0.369140625, "grad_norm_var": 0.0021239598592122395, "learning_rate": 0.0001, "loss": 1.4907, "loss/crossentropy": 2.3825109004974365, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19382364302873611, "step": 7202 }, { "epoch": 0.10755642493971136, "grad_norm": 0.3671875, "grad_norm_var": 0.001913309097290039, "learning_rate": 0.0001, "loss": 1.6078, "loss/crossentropy": 2.7137064933776855, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.22887463122606277, "step": 7203 }, { "epoch": 0.10757135711032634, "grad_norm": 0.37109375, "grad_norm_var": 0.0019098917643229167, "learning_rate": 0.0001, "loss": 1.5466, "loss/crossentropy": 3.0342620611190796, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.22233743965625763, "step": 7204 }, { "epoch": 0.10758628928094133, "grad_norm": 0.359375, "grad_norm_var": 0.0019070784250895183, "learning_rate": 0.0001, "loss": 1.5177, "loss/crossentropy": 2.3280199766159058, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.20126532018184662, "step": 7205 }, { "epoch": 0.1076012214515563, "grad_norm": 0.345703125, "grad_norm_var": 0.001878213882446289, "learning_rate": 0.0001, "loss": 1.5809, "loss/crossentropy": 2.3204976320266724, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.19805577397346497, "step": 7206 }, { "epoch": 0.10761615362217129, "grad_norm": 0.365234375, "grad_norm_var": 0.0018732070922851563, "learning_rate": 0.0001, "loss": 1.4071, "loss/crossentropy": 2.5825397968292236, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.1844889000058174, "step": 7207 }, { "epoch": 0.10763108579278627, "grad_norm": 0.3984375, "grad_norm_var": 0.0019258975982666016, "learning_rate": 0.0001, "loss": 1.8218, "loss/crossentropy": 2.4879103899002075, "loss/fcd": 1.53515625, "loss/idx": 10.0, "loss/logits": 0.2865998223423958, "step": 7208 }, { "epoch": 0.10764601796340124, "grad_norm": 0.392578125, "grad_norm_var": 0.0019434611002604166, "learning_rate": 0.0001, "loss": 1.4059, "loss/crossentropy": 2.842274785041809, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17935192584991455, "step": 7209 }, { "epoch": 0.10766095013401623, "grad_norm": 0.3359375, "grad_norm_var": 0.0020205179850260415, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.6993895769119263, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.18917150795459747, "step": 7210 }, { "epoch": 0.10767588230463121, "grad_norm": 0.294921875, "grad_norm_var": 0.0008261203765869141, "learning_rate": 0.0001, "loss": 1.4053, "loss/crossentropy": 2.414622187614441, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.1787302941083908, "step": 7211 }, { "epoch": 0.1076908144752462, "grad_norm": 0.306640625, "grad_norm_var": 0.0009536107381184895, "learning_rate": 0.0001, "loss": 1.4756, "loss/crossentropy": 2.66467821598053, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.21390695869922638, "step": 7212 }, { "epoch": 0.10770574664586118, "grad_norm": 0.341796875, "grad_norm_var": 0.0008365472157796223, "learning_rate": 0.0001, "loss": 1.4181, "loss/crossentropy": 2.6015278100967407, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.17977885901927948, "step": 7213 }, { "epoch": 0.10772067881647615, "grad_norm": 0.357421875, "grad_norm_var": 0.0008289972941080729, "learning_rate": 0.0001, "loss": 1.5008, "loss/crossentropy": 2.691476821899414, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.211740180850029, "step": 7214 }, { "epoch": 0.10773561098709114, "grad_norm": 0.349609375, "grad_norm_var": 0.0007679621378580729, "learning_rate": 0.0001, "loss": 1.3637, "loss/crossentropy": 2.637707233428955, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.1683512032032013, "step": 7215 }, { "epoch": 0.10775054315770612, "grad_norm": 0.33984375, "grad_norm_var": 0.0007229963938395182, "learning_rate": 0.0001, "loss": 1.5496, "loss/crossentropy": 2.3019585609436035, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.20583951473236084, "step": 7216 }, { "epoch": 0.1077654753283211, "grad_norm": 0.71484375, "grad_norm_var": 0.008903868993123372, "learning_rate": 0.0001, "loss": 1.9209, "loss/crossentropy": 3.1242555379867554, "loss/fcd": 1.62890625, "loss/idx": 10.0, "loss/logits": 0.2919708788394928, "step": 7217 }, { "epoch": 0.10778040749893608, "grad_norm": 0.33203125, "grad_norm_var": 0.009021949768066407, "learning_rate": 0.0001, "loss": 1.3693, "loss/crossentropy": 2.5370599031448364, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.15831658244132996, "step": 7218 }, { "epoch": 0.10779533966955106, "grad_norm": 0.326171875, "grad_norm_var": 0.009160470962524415, "learning_rate": 0.0001, "loss": 1.3575, "loss/crossentropy": 2.572067379951477, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.1699615940451622, "step": 7219 }, { "epoch": 0.10781027184016605, "grad_norm": 0.408203125, "grad_norm_var": 0.00924835205078125, "learning_rate": 0.0001, "loss": 1.5806, "loss/crossentropy": 2.6318947076797485, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.2211855724453926, "step": 7220 }, { "epoch": 0.10782520401078102, "grad_norm": 0.375, "grad_norm_var": 0.009235127766927084, "learning_rate": 0.0001, "loss": 1.5832, "loss/crossentropy": 2.69174861907959, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.20037713646888733, "step": 7221 }, { "epoch": 0.10784013618139601, "grad_norm": 0.421875, "grad_norm_var": 0.009310134251912435, "learning_rate": 0.0001, "loss": 1.494, "loss/crossentropy": 2.6756619215011597, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19711843132972717, "step": 7222 }, { "epoch": 0.10785506835201099, "grad_norm": 0.703125, "grad_norm_var": 0.015835316975911458, "learning_rate": 0.0001, "loss": 1.5726, "loss/crossentropy": 3.403194785118103, "loss/fcd": 1.42578125, "loss/idx": 10.0, "loss/logits": 0.14682401716709137, "step": 7223 }, { "epoch": 0.10787000052262598, "grad_norm": 0.333984375, "grad_norm_var": 0.016107543309529623, "learning_rate": 0.0001, "loss": 1.3262, "loss/crossentropy": 2.6787986755371094, "loss/fcd": 1.16015625, "loss/idx": 10.0, "loss/logits": 0.16602195799350739, "step": 7224 }, { "epoch": 0.10788493269324095, "grad_norm": 0.37890625, "grad_norm_var": 0.016125233968098958, "learning_rate": 0.0001, "loss": 1.6229, "loss/crossentropy": 2.6641011238098145, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.21661901473999023, "step": 7225 }, { "epoch": 0.10789986486385593, "grad_norm": 0.359375, "grad_norm_var": 0.015974934895833334, "learning_rate": 0.0001, "loss": 1.3447, "loss/crossentropy": 2.6657084226608276, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.16891920566558838, "step": 7226 }, { "epoch": 0.10791479703447092, "grad_norm": 0.353515625, "grad_norm_var": 0.015396054585774739, "learning_rate": 0.0001, "loss": 1.501, "loss/crossentropy": 2.6410382986068726, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.2041448950767517, "step": 7227 }, { "epoch": 0.1079297292050859, "grad_norm": 0.396484375, "grad_norm_var": 0.014780426025390625, "learning_rate": 0.0001, "loss": 1.4135, "loss/crossentropy": 2.88156259059906, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.17916939407587051, "step": 7228 }, { "epoch": 0.10794466137570088, "grad_norm": 0.322265625, "grad_norm_var": 0.014970842997233074, "learning_rate": 0.0001, "loss": 1.4423, "loss/crossentropy": 2.6057077646255493, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.18450195342302322, "step": 7229 }, { "epoch": 0.10795959354631586, "grad_norm": 0.373046875, "grad_norm_var": 0.01488793690999349, "learning_rate": 0.0001, "loss": 1.4174, "loss/crossentropy": 2.480318069458008, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18301472812891006, "step": 7230 }, { "epoch": 0.10797452571693084, "grad_norm": 0.341796875, "grad_norm_var": 0.014949989318847657, "learning_rate": 0.0001, "loss": 1.3983, "loss/crossentropy": 2.579197883605957, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.1873602718114853, "step": 7231 }, { "epoch": 0.10798945788754583, "grad_norm": 0.34375, "grad_norm_var": 0.0149169921875, "learning_rate": 0.0001, "loss": 1.5252, "loss/crossentropy": 2.574245810508728, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20492149889469147, "step": 7232 }, { "epoch": 0.1080043900581608, "grad_norm": 0.3671875, "grad_norm_var": 0.008121172587076822, "learning_rate": 0.0001, "loss": 1.5228, "loss/crossentropy": 2.4919480085372925, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.2141692042350769, "step": 7233 }, { "epoch": 0.10801932222877579, "grad_norm": 0.359375, "grad_norm_var": 0.007980092366536459, "learning_rate": 0.0001, "loss": 1.5765, "loss/crossentropy": 2.5014744997024536, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.20148421823978424, "step": 7234 }, { "epoch": 0.10803425439939077, "grad_norm": 0.326171875, "grad_norm_var": 0.007980092366536459, "learning_rate": 0.0001, "loss": 1.385, "loss/crossentropy": 2.612009048461914, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.16621330380439758, "step": 7235 }, { "epoch": 0.10804918657000574, "grad_norm": 0.36328125, "grad_norm_var": 0.007968759536743164, "learning_rate": 0.0001, "loss": 1.431, "loss/crossentropy": 2.705093741416931, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1848660707473755, "step": 7236 }, { "epoch": 0.10806411874062073, "grad_norm": 0.42578125, "grad_norm_var": 0.00807951291402181, "learning_rate": 0.0001, "loss": 1.6195, "loss/crossentropy": 2.394696593284607, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.21329452097415924, "step": 7237 }, { "epoch": 0.10807905091123571, "grad_norm": 0.37890625, "grad_norm_var": 0.007987197240193684, "learning_rate": 0.0001, "loss": 1.6163, "loss/crossentropy": 2.390422224998474, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.229617677628994, "step": 7238 }, { "epoch": 0.1080939830818507, "grad_norm": 0.34765625, "grad_norm_var": 0.0007088820139567058, "learning_rate": 0.0001, "loss": 1.4829, "loss/crossentropy": 2.7627227306365967, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.19777049124240875, "step": 7239 }, { "epoch": 0.10810891525246567, "grad_norm": 0.396484375, "grad_norm_var": 0.0007302443186442058, "learning_rate": 0.0001, "loss": 1.4444, "loss/crossentropy": 2.628973364830017, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.17875273525714874, "step": 7240 }, { "epoch": 0.10812384742308066, "grad_norm": 0.419921875, "grad_norm_var": 0.0009134928385416667, "learning_rate": 0.0001, "loss": 1.6365, "loss/crossentropy": 2.599604606628418, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.22242402285337448, "step": 7241 }, { "epoch": 0.10813877959369564, "grad_norm": 0.310546875, "grad_norm_var": 0.0011133670806884766, "learning_rate": 0.0001, "loss": 1.3458, "loss/crossentropy": 2.648081064224243, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.1699966862797737, "step": 7242 }, { "epoch": 0.10815371176431061, "grad_norm": 0.39453125, "grad_norm_var": 0.001160430908203125, "learning_rate": 0.0001, "loss": 1.6266, "loss/crossentropy": 2.624492049217224, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.2125212699174881, "step": 7243 }, { "epoch": 0.1081686439349256, "grad_norm": 0.361328125, "grad_norm_var": 0.0010980606079101563, "learning_rate": 0.0001, "loss": 1.5456, "loss/crossentropy": 2.691373348236084, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.20184536278247833, "step": 7244 }, { "epoch": 0.10818357610554058, "grad_norm": 0.353515625, "grad_norm_var": 0.0009831110636393228, "learning_rate": 0.0001, "loss": 1.532, "loss/crossentropy": 2.566012382507324, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.215606689453125, "step": 7245 }, { "epoch": 0.10819850827615557, "grad_norm": 0.345703125, "grad_norm_var": 0.0010058085123697917, "learning_rate": 0.0001, "loss": 1.4875, "loss/crossentropy": 2.5549120903015137, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19845789670944214, "step": 7246 }, { "epoch": 0.10821344044677055, "grad_norm": 0.48046875, "grad_norm_var": 0.0017833550771077474, "learning_rate": 0.0001, "loss": 1.501, "loss/crossentropy": 2.7201818227767944, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19631268829107285, "step": 7247 }, { "epoch": 0.10822837261738552, "grad_norm": 0.41015625, "grad_norm_var": 0.0017963250478108723, "learning_rate": 0.0001, "loss": 1.8516, "loss/crossentropy": 2.371829628944397, "loss/fcd": 1.55859375, "loss/idx": 10.0, "loss/logits": 0.2930530309677124, "step": 7248 }, { "epoch": 0.10824330478800051, "grad_norm": 0.34765625, "grad_norm_var": 0.0018471876780192058, "learning_rate": 0.0001, "loss": 1.4546, "loss/crossentropy": 2.6611956357955933, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18898579478263855, "step": 7249 }, { "epoch": 0.10825823695861549, "grad_norm": 0.31640625, "grad_norm_var": 0.002059793472290039, "learning_rate": 0.0001, "loss": 1.4076, "loss/crossentropy": 2.6667356491088867, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1771707385778427, "step": 7250 }, { "epoch": 0.10827316912923048, "grad_norm": 0.421875, "grad_norm_var": 0.0020263036092122394, "learning_rate": 0.0001, "loss": 1.7543, "loss/crossentropy": 2.129779815673828, "loss/fcd": 1.51171875, "loss/idx": 10.0, "loss/logits": 0.24257254600524902, "step": 7251 }, { "epoch": 0.10828810129984545, "grad_norm": 0.33203125, "grad_norm_var": 0.002155494689941406, "learning_rate": 0.0001, "loss": 1.3875, "loss/crossentropy": 2.5894051790237427, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17656850814819336, "step": 7252 }, { "epoch": 0.10830303347046043, "grad_norm": 0.30859375, "grad_norm_var": 0.002262306213378906, "learning_rate": 0.0001, "loss": 1.3982, "loss/crossentropy": 2.710377812385559, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.18723227083683014, "step": 7253 }, { "epoch": 0.10831796564107542, "grad_norm": 0.330078125, "grad_norm_var": 0.0023556868235270183, "learning_rate": 0.0001, "loss": 1.4318, "loss/crossentropy": 2.6047972440719604, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18961767107248306, "step": 7254 }, { "epoch": 0.10833289781169039, "grad_norm": 0.375, "grad_norm_var": 0.0023307641347249347, "learning_rate": 0.0001, "loss": 1.4403, "loss/crossentropy": 2.547015905380249, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.182533860206604, "step": 7255 }, { "epoch": 0.10834782998230538, "grad_norm": 0.33984375, "grad_norm_var": 0.0023238499959309894, "learning_rate": 0.0001, "loss": 1.4513, "loss/crossentropy": 2.5841996669769287, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19343814253807068, "step": 7256 }, { "epoch": 0.10836276215292036, "grad_norm": 0.31640625, "grad_norm_var": 0.0022421360015869142, "learning_rate": 0.0001, "loss": 1.4355, "loss/crossentropy": 2.3994020223617554, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1777331382036209, "step": 7257 }, { "epoch": 0.10837769432353533, "grad_norm": 0.380859375, "grad_norm_var": 0.0020967960357666016, "learning_rate": 0.0001, "loss": 1.3672, "loss/crossentropy": 2.3822484016418457, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.16019974648952484, "step": 7258 }, { "epoch": 0.10839262649415032, "grad_norm": 0.41015625, "grad_norm_var": 0.0021769046783447266, "learning_rate": 0.0001, "loss": 1.666, "loss/crossentropy": 2.729619264602661, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.2519430071115494, "step": 7259 }, { "epoch": 0.1084075586647653, "grad_norm": 0.3359375, "grad_norm_var": 0.002227528889973958, "learning_rate": 0.0001, "loss": 1.3935, "loss/crossentropy": 2.703024744987488, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.17864388972520828, "step": 7260 }, { "epoch": 0.10842249083538029, "grad_norm": 0.36328125, "grad_norm_var": 0.002221409479777018, "learning_rate": 0.0001, "loss": 1.4015, "loss/crossentropy": 2.505999445915222, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.16709938645362854, "step": 7261 }, { "epoch": 0.10843742300599526, "grad_norm": 0.31640625, "grad_norm_var": 0.002344195048014323, "learning_rate": 0.0001, "loss": 1.4173, "loss/crossentropy": 2.462623119354248, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1711849942803383, "step": 7262 }, { "epoch": 0.10845235517661025, "grad_norm": 0.37109375, "grad_norm_var": 0.0013579686482747396, "learning_rate": 0.0001, "loss": 1.5151, "loss/crossentropy": 2.6176135540008545, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.19088785350322723, "step": 7263 }, { "epoch": 0.10846728734722523, "grad_norm": 0.33203125, "grad_norm_var": 0.0011621475219726562, "learning_rate": 0.0001, "loss": 1.4888, "loss/crossentropy": 2.5412851572036743, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.21536307036876678, "step": 7264 }, { "epoch": 0.1084822195178402, "grad_norm": 0.4765625, "grad_norm_var": 0.002162933349609375, "learning_rate": 0.0001, "loss": 1.6353, "loss/crossentropy": 2.5727462768554688, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.24860476702451706, "step": 7265 }, { "epoch": 0.1084971516884552, "grad_norm": 0.32421875, "grad_norm_var": 0.0021235148111979165, "learning_rate": 0.0001, "loss": 1.3287, "loss/crossentropy": 2.487787365913391, "loss/fcd": 1.171875, "loss/idx": 10.0, "loss/logits": 0.15682365745306015, "step": 7266 }, { "epoch": 0.10851208385907017, "grad_norm": 0.353515625, "grad_norm_var": 0.001837015151977539, "learning_rate": 0.0001, "loss": 1.5108, "loss/crossentropy": 2.6393308639526367, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.20613282918930054, "step": 7267 }, { "epoch": 0.10852701602968516, "grad_norm": 0.306640625, "grad_norm_var": 0.0019521077473958334, "learning_rate": 0.0001, "loss": 1.3627, "loss/crossentropy": 2.6164921522140503, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.17126142233610153, "step": 7268 }, { "epoch": 0.10854194820030014, "grad_norm": 0.33984375, "grad_norm_var": 0.0018300374348958334, "learning_rate": 0.0001, "loss": 1.5228, "loss/crossentropy": 2.6639078855514526, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.21034622937440872, "step": 7269 }, { "epoch": 0.10855688037091511, "grad_norm": 0.365234375, "grad_norm_var": 0.0017928441365559896, "learning_rate": 0.0001, "loss": 1.5024, "loss/crossentropy": 2.5960110425949097, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.2055470049381256, "step": 7270 }, { "epoch": 0.1085718125415301, "grad_norm": 0.3203125, "grad_norm_var": 0.0018462498982747396, "learning_rate": 0.0001, "loss": 1.4223, "loss/crossentropy": 2.443670153617859, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.17233717441558838, "step": 7271 }, { "epoch": 0.10858674471214508, "grad_norm": 0.392578125, "grad_norm_var": 0.00192564328511556, "learning_rate": 0.0001, "loss": 1.3742, "loss/crossentropy": 2.7648638486862183, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.16715078800916672, "step": 7272 }, { "epoch": 0.10860167688276007, "grad_norm": 0.32421875, "grad_norm_var": 0.0018876234690348307, "learning_rate": 0.0001, "loss": 1.5321, "loss/crossentropy": 2.620391845703125, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.21956294029951096, "step": 7273 }, { "epoch": 0.10861660905337504, "grad_norm": 0.30859375, "grad_norm_var": 0.0019846598307291666, "learning_rate": 0.0001, "loss": 1.499, "loss/crossentropy": 2.5356441736221313, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.21384628117084503, "step": 7274 }, { "epoch": 0.10863154122399002, "grad_norm": 0.373046875, "grad_norm_var": 0.0017856438954671225, "learning_rate": 0.0001, "loss": 1.4402, "loss/crossentropy": 2.7233290672302246, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18628539890050888, "step": 7275 }, { "epoch": 0.10864647339460501, "grad_norm": 0.419921875, "grad_norm_var": 0.0020665486653645834, "learning_rate": 0.0001, "loss": 1.5431, "loss/crossentropy": 2.492472290992737, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.22280213236808777, "step": 7276 }, { "epoch": 0.10866140556521998, "grad_norm": 0.375, "grad_norm_var": 0.0020873387654622394, "learning_rate": 0.0001, "loss": 1.6686, "loss/crossentropy": 2.3935736417770386, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.23503738641738892, "step": 7277 }, { "epoch": 0.10867633773583497, "grad_norm": 0.36328125, "grad_norm_var": 0.0019759496053059896, "learning_rate": 0.0001, "loss": 1.5436, "loss/crossentropy": 2.770113229751587, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.23895461112260818, "step": 7278 }, { "epoch": 0.10869126990644995, "grad_norm": 0.345703125, "grad_norm_var": 0.0019757429758707683, "learning_rate": 0.0001, "loss": 1.5555, "loss/crossentropy": 2.650293469429016, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.2000560313463211, "step": 7279 }, { "epoch": 0.10870620207706493, "grad_norm": 0.427734375, "grad_norm_var": 0.002222633361816406, "learning_rate": 0.0001, "loss": 1.5089, "loss/crossentropy": 2.7200690507888794, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.18080949783325195, "step": 7280 }, { "epoch": 0.10872113424767992, "grad_norm": 0.333984375, "grad_norm_var": 0.0013442834218343098, "learning_rate": 0.0001, "loss": 1.4932, "loss/crossentropy": 2.6137558221817017, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.20020191371440887, "step": 7281 }, { "epoch": 0.10873606641829489, "grad_norm": 0.369140625, "grad_norm_var": 0.0012883504231770834, "learning_rate": 0.0001, "loss": 1.4677, "loss/crossentropy": 2.519987106323242, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19032981246709824, "step": 7282 }, { "epoch": 0.10875099858890988, "grad_norm": 0.365234375, "grad_norm_var": 0.0012908299763997396, "learning_rate": 0.0001, "loss": 1.5502, "loss/crossentropy": 2.6439192295074463, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.21036673337221146, "step": 7283 }, { "epoch": 0.10876593075952486, "grad_norm": 0.333984375, "grad_norm_var": 0.001149749755859375, "learning_rate": 0.0001, "loss": 1.4803, "loss/crossentropy": 2.6936150789260864, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.20292150229215622, "step": 7284 }, { "epoch": 0.10878086293013985, "grad_norm": 0.314453125, "grad_norm_var": 0.0012578169504801433, "learning_rate": 0.0001, "loss": 1.4657, "loss/crossentropy": 2.5468661785125732, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.2001110538840294, "step": 7285 }, { "epoch": 0.10879579510075482, "grad_norm": 0.341796875, "grad_norm_var": 0.0012704054514567056, "learning_rate": 0.0001, "loss": 1.5002, "loss/crossentropy": 2.7200061082839966, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19548510015010834, "step": 7286 }, { "epoch": 0.1088107272713698, "grad_norm": 0.328125, "grad_norm_var": 0.0012362003326416016, "learning_rate": 0.0001, "loss": 1.4232, "loss/crossentropy": 2.6087169647216797, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18105941265821457, "step": 7287 }, { "epoch": 0.10882565944198479, "grad_norm": 0.33203125, "grad_norm_var": 0.0011805216471354166, "learning_rate": 0.0001, "loss": 1.455, "loss/crossentropy": 2.6261043548583984, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.20503799617290497, "step": 7288 }, { "epoch": 0.10884059161259976, "grad_norm": 0.361328125, "grad_norm_var": 0.0011216322580973306, "learning_rate": 0.0001, "loss": 1.5397, "loss/crossentropy": 2.5522522926330566, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20378752052783966, "step": 7289 }, { "epoch": 0.10885552378321475, "grad_norm": 0.365234375, "grad_norm_var": 0.0009653727213541667, "learning_rate": 0.0001, "loss": 1.4984, "loss/crossentropy": 2.381470203399658, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.18594016879796982, "step": 7290 }, { "epoch": 0.10887045595382973, "grad_norm": 0.341796875, "grad_norm_var": 0.0009694417317708333, "learning_rate": 0.0001, "loss": 1.444, "loss/crossentropy": 2.8612009286880493, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.18621908128261566, "step": 7291 }, { "epoch": 0.1088853881244447, "grad_norm": 0.31640625, "grad_norm_var": 0.0007765293121337891, "learning_rate": 0.0001, "loss": 1.4437, "loss/crossentropy": 2.628493905067444, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.20933088660240173, "step": 7292 }, { "epoch": 0.1089003202950597, "grad_norm": 0.408203125, "grad_norm_var": 0.0009518941243489583, "learning_rate": 0.0001, "loss": 1.492, "loss/crossentropy": 2.6407047510147095, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.20289724320173264, "step": 7293 }, { "epoch": 0.10891525246567467, "grad_norm": 0.3828125, "grad_norm_var": 0.0010024388631184897, "learning_rate": 0.0001, "loss": 1.495, "loss/crossentropy": 2.6335291862487793, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.21768057346343994, "step": 7294 }, { "epoch": 0.10893018463628966, "grad_norm": 0.310546875, "grad_norm_var": 0.0011197408040364584, "learning_rate": 0.0001, "loss": 1.3831, "loss/crossentropy": 2.7151646614074707, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.17997469007968903, "step": 7295 }, { "epoch": 0.10894511680690463, "grad_norm": 0.345703125, "grad_norm_var": 0.0007125218709309896, "learning_rate": 0.0001, "loss": 1.6666, "loss/crossentropy": 2.29997056722641, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.24477045238018036, "step": 7296 }, { "epoch": 0.10896004897751961, "grad_norm": 0.353515625, "grad_norm_var": 0.000702667236328125, "learning_rate": 0.0001, "loss": 1.435, "loss/crossentropy": 2.668389320373535, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18499404937028885, "step": 7297 }, { "epoch": 0.1089749811481346, "grad_norm": 0.40625, "grad_norm_var": 0.0008926232655843099, "learning_rate": 0.0001, "loss": 1.5676, "loss/crossentropy": 2.6468148231506348, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.21607131510972977, "step": 7298 }, { "epoch": 0.10898991331874958, "grad_norm": 0.427734375, "grad_norm_var": 0.0012598514556884765, "learning_rate": 0.0001, "loss": 1.6749, "loss/crossentropy": 2.5097485780715942, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.23739054054021835, "step": 7299 }, { "epoch": 0.10900484548936457, "grad_norm": 0.341796875, "grad_norm_var": 0.0012424310048421224, "learning_rate": 0.0001, "loss": 1.6138, "loss/crossentropy": 2.5916998386383057, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.2309522181749344, "step": 7300 }, { "epoch": 0.10901977765997954, "grad_norm": 0.359375, "grad_norm_var": 0.001126543680826823, "learning_rate": 0.0001, "loss": 1.5272, "loss/crossentropy": 2.7185709476470947, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.20302119106054306, "step": 7301 }, { "epoch": 0.10903470983059453, "grad_norm": 0.37890625, "grad_norm_var": 0.0011340936024983725, "learning_rate": 0.0001, "loss": 1.4942, "loss/crossentropy": 2.6915677785873413, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.20901218056678772, "step": 7302 }, { "epoch": 0.1090496420012095, "grad_norm": 0.35546875, "grad_norm_var": 0.0010646661122639975, "learning_rate": 0.0001, "loss": 1.4925, "loss/crossentropy": 2.606805443763733, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.20737537741661072, "step": 7303 }, { "epoch": 0.10906457417182448, "grad_norm": 0.35546875, "grad_norm_var": 0.001006301244099935, "learning_rate": 0.0001, "loss": 1.5755, "loss/crossentropy": 2.1442739367485046, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.20828627049922943, "step": 7304 }, { "epoch": 0.10907950634243947, "grad_norm": 0.291015625, "grad_norm_var": 0.0013324578603108725, "learning_rate": 0.0001, "loss": 1.4056, "loss/crossentropy": 2.5350120067596436, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.19072973728179932, "step": 7305 }, { "epoch": 0.10909443851305445, "grad_norm": 0.322265625, "grad_norm_var": 0.0014107863108317057, "learning_rate": 0.0001, "loss": 1.4852, "loss/crossentropy": 2.525503396987915, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.19222845137119293, "step": 7306 }, { "epoch": 0.10910937068366944, "grad_norm": 0.3515625, "grad_norm_var": 0.0013981501261393229, "learning_rate": 0.0001, "loss": 1.4521, "loss/crossentropy": 2.445563316345215, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.17866557091474533, "step": 7307 }, { "epoch": 0.10912430285428441, "grad_norm": 0.349609375, "grad_norm_var": 0.0012887159983317057, "learning_rate": 0.0001, "loss": 1.5935, "loss/crossentropy": 2.4190375804901123, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.23409351706504822, "step": 7308 }, { "epoch": 0.10913923502489939, "grad_norm": 0.375, "grad_norm_var": 0.0011387507120768229, "learning_rate": 0.0001, "loss": 1.4897, "loss/crossentropy": 2.5749794244766235, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.18895110487937927, "step": 7309 }, { "epoch": 0.10915416719551438, "grad_norm": 0.31640625, "grad_norm_var": 0.0011830647786458333, "learning_rate": 0.0001, "loss": 1.3963, "loss/crossentropy": 2.437833547592163, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.1775362268090248, "step": 7310 }, { "epoch": 0.10916909936612935, "grad_norm": 0.341796875, "grad_norm_var": 0.0010691324869791666, "learning_rate": 0.0001, "loss": 1.539, "loss/crossentropy": 2.575536608695984, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.21865150332450867, "step": 7311 }, { "epoch": 0.10918403153674434, "grad_norm": 0.326171875, "grad_norm_var": 0.0011158625284830729, "learning_rate": 0.0001, "loss": 1.3789, "loss/crossentropy": 2.6069682836532593, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.16794230788946152, "step": 7312 }, { "epoch": 0.10919896370735932, "grad_norm": 0.3203125, "grad_norm_var": 0.0011836846669514975, "learning_rate": 0.0001, "loss": 1.4356, "loss/crossentropy": 2.2871906757354736, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1934218406677246, "step": 7313 }, { "epoch": 0.1092138958779743, "grad_norm": 0.5859375, "grad_norm_var": 0.0045206546783447266, "learning_rate": 0.0001, "loss": 1.9142, "loss/crossentropy": 2.5213987827301025, "loss/fcd": 1.5546875, "loss/idx": 10.0, "loss/logits": 0.359489768743515, "step": 7314 }, { "epoch": 0.10922882804858929, "grad_norm": 0.3203125, "grad_norm_var": 0.004306475321451823, "learning_rate": 0.0001, "loss": 1.3551, "loss/crossentropy": 2.685714840888977, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.16762170940637589, "step": 7315 }, { "epoch": 0.10924376021920426, "grad_norm": 0.3671875, "grad_norm_var": 0.004299656550089518, "learning_rate": 0.0001, "loss": 1.4899, "loss/crossentropy": 2.389867901802063, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.1969531774520874, "step": 7316 }, { "epoch": 0.10925869238981925, "grad_norm": 0.484375, "grad_norm_var": 0.005310805638631185, "learning_rate": 0.0001, "loss": 1.6164, "loss/crossentropy": 2.540328860282898, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.22182993590831757, "step": 7317 }, { "epoch": 0.10927362456043423, "grad_norm": 0.3671875, "grad_norm_var": 0.00529783566792806, "learning_rate": 0.0001, "loss": 1.51, "loss/crossentropy": 2.5827348232269287, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.2053593397140503, "step": 7318 }, { "epoch": 0.1092885567310492, "grad_norm": 0.337890625, "grad_norm_var": 0.005338033040364583, "learning_rate": 0.0001, "loss": 1.5478, "loss/crossentropy": 2.495978832244873, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.1884167417883873, "step": 7319 }, { "epoch": 0.10930348890166419, "grad_norm": 0.330078125, "grad_norm_var": 0.005404774347941081, "learning_rate": 0.0001, "loss": 1.4397, "loss/crossentropy": 2.51314914226532, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18576855957508087, "step": 7320 }, { "epoch": 0.10931842107227917, "grad_norm": 0.373046875, "grad_norm_var": 0.00505229632059733, "learning_rate": 0.0001, "loss": 1.5746, "loss/crossentropy": 2.6438536643981934, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.21136623620986938, "step": 7321 }, { "epoch": 0.10933335324289416, "grad_norm": 0.330078125, "grad_norm_var": 0.005009698867797852, "learning_rate": 0.0001, "loss": 1.6101, "loss/crossentropy": 2.276650309562683, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.2507288455963135, "step": 7322 }, { "epoch": 0.10934828541350913, "grad_norm": 0.392578125, "grad_norm_var": 0.005028724670410156, "learning_rate": 0.0001, "loss": 1.5646, "loss/crossentropy": 2.667941689491272, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.22473537921905518, "step": 7323 }, { "epoch": 0.10936321758412412, "grad_norm": 0.369140625, "grad_norm_var": 0.004999796549479167, "learning_rate": 0.0001, "loss": 1.4132, "loss/crossentropy": 2.601942539215088, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1710394024848938, "step": 7324 }, { "epoch": 0.1093781497547391, "grad_norm": 0.330078125, "grad_norm_var": 0.00510252316792806, "learning_rate": 0.0001, "loss": 1.4095, "loss/crossentropy": 2.4759676456451416, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.17907893657684326, "step": 7325 }, { "epoch": 0.10939308192535407, "grad_norm": 0.326171875, "grad_norm_var": 0.005040931701660156, "learning_rate": 0.0001, "loss": 1.37, "loss/crossentropy": 2.5654085874557495, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.16685759276151657, "step": 7326 }, { "epoch": 0.10940801409596906, "grad_norm": 0.33203125, "grad_norm_var": 0.005082178115844727, "learning_rate": 0.0001, "loss": 1.4724, "loss/crossentropy": 2.6985597610473633, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19894107431173325, "step": 7327 }, { "epoch": 0.10942294626658404, "grad_norm": 0.337890625, "grad_norm_var": 0.005024957656860352, "learning_rate": 0.0001, "loss": 1.4027, "loss/crossentropy": 2.7455588579177856, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.17223718762397766, "step": 7328 }, { "epoch": 0.10943787843719903, "grad_norm": 0.380859375, "grad_norm_var": 0.004860877990722656, "learning_rate": 0.0001, "loss": 1.5104, "loss/crossentropy": 2.634095549583435, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.19004026800394058, "step": 7329 }, { "epoch": 0.109452810607814, "grad_norm": 0.34765625, "grad_norm_var": 0.001638031005859375, "learning_rate": 0.0001, "loss": 1.5195, "loss/crossentropy": 2.5499669313430786, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.19922727346420288, "step": 7330 }, { "epoch": 0.10946774277842898, "grad_norm": 0.333984375, "grad_norm_var": 0.0015811761220296224, "learning_rate": 0.0001, "loss": 1.4467, "loss/crossentropy": 2.7881855964660645, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.18891648948192596, "step": 7331 }, { "epoch": 0.10948267494904397, "grad_norm": 0.380859375, "grad_norm_var": 0.0016082127888997395, "learning_rate": 0.0001, "loss": 1.4829, "loss/crossentropy": 2.751067042350769, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.18994608521461487, "step": 7332 }, { "epoch": 0.10949760711965895, "grad_norm": 0.365234375, "grad_norm_var": 0.0005135695139567057, "learning_rate": 0.0001, "loss": 1.5614, "loss/crossentropy": 2.627852201461792, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.22159802913665771, "step": 7333 }, { "epoch": 0.10951253929027394, "grad_norm": 0.50390625, "grad_norm_var": 0.001955525080362956, "learning_rate": 0.0001, "loss": 1.4888, "loss/crossentropy": 2.65875780582428, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.19586686789989471, "step": 7334 }, { "epoch": 0.10952747146088891, "grad_norm": 0.341796875, "grad_norm_var": 0.001944589614868164, "learning_rate": 0.0001, "loss": 1.3617, "loss/crossentropy": 2.5033644437789917, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.17424336820840836, "step": 7335 }, { "epoch": 0.10954240363150389, "grad_norm": 0.52734375, "grad_norm_var": 0.0035643895467122396, "learning_rate": 0.0001, "loss": 1.3802, "loss/crossentropy": 2.672670006752014, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.17316222190856934, "step": 7336 }, { "epoch": 0.10955733580211888, "grad_norm": 0.419921875, "grad_norm_var": 0.0037001927693684896, "learning_rate": 0.0001, "loss": 1.4449, "loss/crossentropy": 2.7254762649536133, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.18705594539642334, "step": 7337 }, { "epoch": 0.10957226797273385, "grad_norm": 0.36328125, "grad_norm_var": 0.003564818700154622, "learning_rate": 0.0001, "loss": 1.5106, "loss/crossentropy": 2.6412750482559204, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.205941304564476, "step": 7338 }, { "epoch": 0.10958720014334884, "grad_norm": 0.365234375, "grad_norm_var": 0.0035594781239827473, "learning_rate": 0.0001, "loss": 1.4254, "loss/crossentropy": 2.5415639877319336, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.17540370672941208, "step": 7339 }, { "epoch": 0.10960213231396382, "grad_norm": 0.39453125, "grad_norm_var": 0.003574562072753906, "learning_rate": 0.0001, "loss": 1.4555, "loss/crossentropy": 2.6082428693771362, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.17426753044128418, "step": 7340 }, { "epoch": 0.1096170644845788, "grad_norm": 0.466796875, "grad_norm_var": 0.0038660685221354168, "learning_rate": 0.0001, "loss": 1.7072, "loss/crossentropy": 2.229456305503845, "loss/fcd": 1.48828125, "loss/idx": 10.0, "loss/logits": 0.21891053020954132, "step": 7341 }, { "epoch": 0.10963199665519378, "grad_norm": 0.3203125, "grad_norm_var": 0.00391551653544108, "learning_rate": 0.0001, "loss": 1.4521, "loss/crossentropy": 2.438105583190918, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.18257876485586166, "step": 7342 }, { "epoch": 0.10964692882580876, "grad_norm": 0.369140625, "grad_norm_var": 0.0037328084309895832, "learning_rate": 0.0001, "loss": 1.413, "loss/crossentropy": 2.6330186128616333, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.17476347088813782, "step": 7343 }, { "epoch": 0.10966186099642375, "grad_norm": 0.361328125, "grad_norm_var": 0.003608449300130208, "learning_rate": 0.0001, "loss": 1.4821, "loss/crossentropy": 2.6179083585739136, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.20475506782531738, "step": 7344 }, { "epoch": 0.10967679316703872, "grad_norm": 0.55859375, "grad_norm_var": 0.005362939834594726, "learning_rate": 0.0001, "loss": 1.5909, "loss/crossentropy": 2.208295464515686, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.17292196303606033, "step": 7345 }, { "epoch": 0.10969172533765371, "grad_norm": 0.412109375, "grad_norm_var": 0.00516204833984375, "learning_rate": 0.0001, "loss": 1.5511, "loss/crossentropy": 2.3264832496643066, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.20347699522972107, "step": 7346 }, { "epoch": 0.10970665750826869, "grad_norm": 0.322265625, "grad_norm_var": 0.005282020568847657, "learning_rate": 0.0001, "loss": 1.4832, "loss/crossentropy": 2.415986180305481, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.2058693841099739, "step": 7347 }, { "epoch": 0.10972158967888367, "grad_norm": 0.39453125, "grad_norm_var": 0.00525053342183431, "learning_rate": 0.0001, "loss": 1.569, "loss/crossentropy": 2.5489771366119385, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.22133202850818634, "step": 7348 }, { "epoch": 0.10973652184949866, "grad_norm": 0.34375, "grad_norm_var": 0.0053944269816080725, "learning_rate": 0.0001, "loss": 1.6662, "loss/crossentropy": 2.246616244316101, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.26387814432382584, "step": 7349 }, { "epoch": 0.10975145402011363, "grad_norm": 0.3515625, "grad_norm_var": 0.004816691080729167, "learning_rate": 0.0001, "loss": 1.3717, "loss/crossentropy": 2.637412190437317, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.1724644899368286, "step": 7350 }, { "epoch": 0.10976638619072862, "grad_norm": 0.31640625, "grad_norm_var": 0.005035511652628581, "learning_rate": 0.0001, "loss": 1.4897, "loss/crossentropy": 2.403507351875305, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.20456264913082123, "step": 7351 }, { "epoch": 0.1097813183613436, "grad_norm": 0.337890625, "grad_norm_var": 0.003883806864420573, "learning_rate": 0.0001, "loss": 1.6501, "loss/crossentropy": 2.4976227283477783, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.25945983827114105, "step": 7352 }, { "epoch": 0.10979625053195857, "grad_norm": 0.318359375, "grad_norm_var": 0.004002825419108073, "learning_rate": 0.0001, "loss": 1.3915, "loss/crossentropy": 2.6943764686584473, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.18448776751756668, "step": 7353 }, { "epoch": 0.10981118270257356, "grad_norm": 0.4140625, "grad_norm_var": 0.0040863037109375, "learning_rate": 0.0001, "loss": 1.5278, "loss/crossentropy": 2.505493640899658, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.19965838640928268, "step": 7354 }, { "epoch": 0.10982611487318854, "grad_norm": 0.375, "grad_norm_var": 0.00407573382059733, "learning_rate": 0.0001, "loss": 1.5761, "loss/crossentropy": 2.5883405208587646, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.2362387701869011, "step": 7355 }, { "epoch": 0.10984104704380353, "grad_norm": 0.33984375, "grad_norm_var": 0.004146051406860351, "learning_rate": 0.0001, "loss": 1.4201, "loss/crossentropy": 2.488842248916626, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1896279901266098, "step": 7356 }, { "epoch": 0.1098559792144185, "grad_norm": 0.353515625, "grad_norm_var": 0.0035634199778238934, "learning_rate": 0.0001, "loss": 1.5418, "loss/crossentropy": 2.5328770875930786, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.2176249772310257, "step": 7357 }, { "epoch": 0.10987091138503348, "grad_norm": 0.380859375, "grad_norm_var": 0.0034072240193684894, "learning_rate": 0.0001, "loss": 1.7122, "loss/crossentropy": 2.535878896713257, "loss/fcd": 1.4609375, "loss/idx": 10.0, "loss/logits": 0.25125204771757126, "step": 7358 }, { "epoch": 0.10988584355564847, "grad_norm": 0.34765625, "grad_norm_var": 0.0034437656402587892, "learning_rate": 0.0001, "loss": 1.4238, "loss/crossentropy": 2.795903444290161, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.16985952109098434, "step": 7359 }, { "epoch": 0.10990077572626344, "grad_norm": 0.359375, "grad_norm_var": 0.0034463882446289064, "learning_rate": 0.0001, "loss": 1.4367, "loss/crossentropy": 2.632167935371399, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.19063936173915863, "step": 7360 }, { "epoch": 0.10991570789687843, "grad_norm": 0.380859375, "grad_norm_var": 0.0009600162506103515, "learning_rate": 0.0001, "loss": 1.5395, "loss/crossentropy": 2.5657986402511597, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.21139442175626755, "step": 7361 }, { "epoch": 0.10993064006749341, "grad_norm": 0.376953125, "grad_norm_var": 0.0007894992828369141, "learning_rate": 0.0001, "loss": 1.4116, "loss/crossentropy": 2.42866313457489, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.18110834807157516, "step": 7362 }, { "epoch": 0.1099455722381084, "grad_norm": 0.388671875, "grad_norm_var": 0.0007570743560791016, "learning_rate": 0.0001, "loss": 1.4687, "loss/crossentropy": 2.6706180572509766, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.18747301399707794, "step": 7363 }, { "epoch": 0.10996050440872338, "grad_norm": 0.330078125, "grad_norm_var": 0.0007303237915039062, "learning_rate": 0.0001, "loss": 1.4677, "loss/crossentropy": 2.5303670167922974, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.18644242733716965, "step": 7364 }, { "epoch": 0.10997543657933835, "grad_norm": 0.4140625, "grad_norm_var": 0.0009134292602539062, "learning_rate": 0.0001, "loss": 1.4412, "loss/crossentropy": 2.843212842941284, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.19505934417247772, "step": 7365 }, { "epoch": 0.10999036874995334, "grad_norm": 0.326171875, "grad_norm_var": 0.0009876092274983723, "learning_rate": 0.0001, "loss": 1.5305, "loss/crossentropy": 2.551764726638794, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.20624516904354095, "step": 7366 }, { "epoch": 0.11000530092056832, "grad_norm": 0.431640625, "grad_norm_var": 0.0011479695638020833, "learning_rate": 0.0001, "loss": 1.3794, "loss/crossentropy": 2.537406325340271, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.1645541414618492, "step": 7367 }, { "epoch": 0.1100202330911833, "grad_norm": 0.33203125, "grad_norm_var": 0.0011730035146077474, "learning_rate": 0.0001, "loss": 1.4842, "loss/crossentropy": 2.733101963996887, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.20687229931354523, "step": 7368 }, { "epoch": 0.11003516526179828, "grad_norm": 0.369140625, "grad_norm_var": 0.0010060469309488933, "learning_rate": 0.0001, "loss": 1.7135, "loss/crossentropy": 2.826459050178528, "loss/fcd": 1.4296875, "loss/idx": 10.0, "loss/logits": 0.2838188409805298, "step": 7369 }, { "epoch": 0.11005009743241326, "grad_norm": 0.361328125, "grad_norm_var": 0.0008700052897135417, "learning_rate": 0.0001, "loss": 1.4863, "loss/crossentropy": 2.479138731956482, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.21290503442287445, "step": 7370 }, { "epoch": 0.11006502960302825, "grad_norm": 0.51953125, "grad_norm_var": 0.0023355484008789062, "learning_rate": 0.0001, "loss": 1.5229, "loss/crossentropy": 2.9373855590820312, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.21037257462739944, "step": 7371 }, { "epoch": 0.11007996177364322, "grad_norm": 0.37109375, "grad_norm_var": 0.002247047424316406, "learning_rate": 0.0001, "loss": 1.4684, "loss/crossentropy": 2.558556079864502, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.18717901408672333, "step": 7372 }, { "epoch": 0.11009489394425821, "grad_norm": 0.71484375, "grad_norm_var": 0.009242486953735352, "learning_rate": 0.0001, "loss": 2.1848, "loss/crossentropy": 2.7204372882843018, "loss/fcd": 1.74609375, "loss/idx": 10.0, "loss/logits": 0.4386947751045227, "step": 7373 }, { "epoch": 0.11010982611487319, "grad_norm": 0.470703125, "grad_norm_var": 0.009514474868774414, "learning_rate": 0.0001, "loss": 1.4191, "loss/crossentropy": 2.2918756008148193, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.14952585846185684, "step": 7374 }, { "epoch": 0.11012475828548816, "grad_norm": 0.36328125, "grad_norm_var": 0.00940842628479004, "learning_rate": 0.0001, "loss": 1.4672, "loss/crossentropy": 2.5834139585494995, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19381187111139297, "step": 7375 }, { "epoch": 0.11013969045610315, "grad_norm": 0.37109375, "grad_norm_var": 0.009342813491821289, "learning_rate": 0.0001, "loss": 1.5227, "loss/crossentropy": 2.6518248319625854, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20237517356872559, "step": 7376 }, { "epoch": 0.11015462262671813, "grad_norm": 0.337890625, "grad_norm_var": 0.009611368179321289, "learning_rate": 0.0001, "loss": 1.3809, "loss/crossentropy": 2.663572072982788, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.1699737310409546, "step": 7377 }, { "epoch": 0.11016955479733312, "grad_norm": 0.384765625, "grad_norm_var": 0.009586064020792644, "learning_rate": 0.0001, "loss": 1.5041, "loss/crossentropy": 2.799098253250122, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.2072586566209793, "step": 7378 }, { "epoch": 0.1101844869679481, "grad_norm": 0.375, "grad_norm_var": 0.009628232320149739, "learning_rate": 0.0001, "loss": 1.438, "loss/crossentropy": 2.5683436393737793, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1802075356245041, "step": 7379 }, { "epoch": 0.11019941913856307, "grad_norm": 0.330078125, "grad_norm_var": 0.009628232320149739, "learning_rate": 0.0001, "loss": 1.4008, "loss/crossentropy": 2.635204792022705, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.18593701720237732, "step": 7380 }, { "epoch": 0.11021435130917806, "grad_norm": 0.328125, "grad_norm_var": 0.009980710347493489, "learning_rate": 0.0001, "loss": 1.4341, "loss/crossentropy": 2.675452947616577, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.19192321598529816, "step": 7381 }, { "epoch": 0.11022928347979304, "grad_norm": 0.369140625, "grad_norm_var": 0.009677886962890625, "learning_rate": 0.0001, "loss": 1.5025, "loss/crossentropy": 2.454861044883728, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19785837084054947, "step": 7382 }, { "epoch": 0.11024421565040803, "grad_norm": 0.345703125, "grad_norm_var": 0.009798177083333333, "learning_rate": 0.0001, "loss": 1.4624, "loss/crossentropy": 2.591955304145813, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.20066194981336594, "step": 7383 }, { "epoch": 0.110259147821023, "grad_norm": 0.41796875, "grad_norm_var": 0.009521230061848959, "learning_rate": 0.0001, "loss": 1.6268, "loss/crossentropy": 2.528507947921753, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.22055334597826004, "step": 7384 }, { "epoch": 0.11027407999163799, "grad_norm": 0.388671875, "grad_norm_var": 0.009459877014160156, "learning_rate": 0.0001, "loss": 1.3939, "loss/crossentropy": 2.5121350288391113, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17127500474452972, "step": 7385 }, { "epoch": 0.11028901216225297, "grad_norm": 0.388671875, "grad_norm_var": 0.009354400634765624, "learning_rate": 0.0001, "loss": 1.6871, "loss/crossentropy": 2.336395740509033, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.2300499603152275, "step": 7386 }, { "epoch": 0.11030394433286794, "grad_norm": 0.39453125, "grad_norm_var": 0.008418528238932292, "learning_rate": 0.0001, "loss": 1.4885, "loss/crossentropy": 2.560030460357666, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.18768159300088882, "step": 7387 }, { "epoch": 0.11031887650348293, "grad_norm": 0.4375, "grad_norm_var": 0.008465003967285157, "learning_rate": 0.0001, "loss": 1.5223, "loss/crossentropy": 2.5799484252929688, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20198260247707367, "step": 7388 }, { "epoch": 0.11033380867409791, "grad_norm": 0.306640625, "grad_norm_var": 0.0018044630686442056, "learning_rate": 0.0001, "loss": 1.3753, "loss/crossentropy": 2.730239748954773, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.1761023998260498, "step": 7389 }, { "epoch": 0.1103487408447129, "grad_norm": 0.40625, "grad_norm_var": 0.0012468973795572917, "learning_rate": 0.0001, "loss": 1.7308, "loss/crossentropy": 2.390581965446472, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.29331328719854355, "step": 7390 }, { "epoch": 0.11036367301532787, "grad_norm": 0.39453125, "grad_norm_var": 0.001273345947265625, "learning_rate": 0.0001, "loss": 1.6186, "loss/crossentropy": 2.552634358406067, "loss/fcd": 1.3984375, "loss/idx": 10.0, "loss/logits": 0.22019022703170776, "step": 7391 }, { "epoch": 0.11037860518594285, "grad_norm": 0.3828125, "grad_norm_var": 0.0012781143188476563, "learning_rate": 0.0001, "loss": 1.5027, "loss/crossentropy": 2.4845407009124756, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.1979745775461197, "step": 7392 }, { "epoch": 0.11039353735655784, "grad_norm": 0.337890625, "grad_norm_var": 0.0012781143188476563, "learning_rate": 0.0001, "loss": 1.539, "loss/crossentropy": 2.6580289602279663, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.20696311444044113, "step": 7393 }, { "epoch": 0.11040846952717281, "grad_norm": 0.333984375, "grad_norm_var": 0.0013682047526041667, "learning_rate": 0.0001, "loss": 1.4841, "loss/crossentropy": 2.515838623046875, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.18725335597991943, "step": 7394 }, { "epoch": 0.1104234016977878, "grad_norm": 0.337890625, "grad_norm_var": 0.0014349460601806641, "learning_rate": 0.0001, "loss": 1.5822, "loss/crossentropy": 2.583065390586853, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.22672590613365173, "step": 7395 }, { "epoch": 0.11043833386840278, "grad_norm": 0.3046875, "grad_norm_var": 0.0016062418619791667, "learning_rate": 0.0001, "loss": 1.2371, "loss/crossentropy": 2.4047064781188965, "loss/fcd": 1.1015625, "loss/idx": 10.0, "loss/logits": 0.13548965752124786, "step": 7396 }, { "epoch": 0.11045326603901776, "grad_norm": 0.330078125, "grad_norm_var": 0.0015963077545166015, "learning_rate": 0.0001, "loss": 1.4528, "loss/crossentropy": 2.463638663291931, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.1793256774544716, "step": 7397 }, { "epoch": 0.11046819820963275, "grad_norm": 0.380859375, "grad_norm_var": 0.0016077518463134765, "learning_rate": 0.0001, "loss": 1.3844, "loss/crossentropy": 2.565721035003662, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.16958583146333694, "step": 7398 }, { "epoch": 0.11048313038024772, "grad_norm": 0.314453125, "grad_norm_var": 0.0017618656158447266, "learning_rate": 0.0001, "loss": 1.457, "loss/crossentropy": 2.505321979522705, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.18745353817939758, "step": 7399 }, { "epoch": 0.11049806255086271, "grad_norm": 0.341796875, "grad_norm_var": 0.00159759521484375, "learning_rate": 0.0001, "loss": 1.5764, "loss/crossentropy": 2.5420037508010864, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.21700576692819595, "step": 7400 }, { "epoch": 0.11051299472147769, "grad_norm": 0.5, "grad_norm_var": 0.0027781009674072267, "learning_rate": 0.0001, "loss": 1.7453, "loss/crossentropy": 2.3250783681869507, "loss/fcd": 1.5078125, "loss/idx": 10.0, "loss/logits": 0.2374676987528801, "step": 7401 }, { "epoch": 0.11052792689209266, "grad_norm": 0.349609375, "grad_norm_var": 0.0027672926584879558, "learning_rate": 0.0001, "loss": 1.5191, "loss/crossentropy": 2.7833805084228516, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20655188709497452, "step": 7402 }, { "epoch": 0.11054285906270765, "grad_norm": 0.3515625, "grad_norm_var": 0.0027183373769124348, "learning_rate": 0.0001, "loss": 1.5623, "loss/crossentropy": 2.5094743967056274, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.21076826751232147, "step": 7403 }, { "epoch": 0.11055779123332263, "grad_norm": 0.357421875, "grad_norm_var": 0.0023253758748372397, "learning_rate": 0.0001, "loss": 1.5125, "loss/crossentropy": 2.7767207622528076, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.2116755098104477, "step": 7404 }, { "epoch": 0.11057272340393762, "grad_norm": 0.333984375, "grad_norm_var": 0.002184295654296875, "learning_rate": 0.0001, "loss": 1.3548, "loss/crossentropy": 2.7956812381744385, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.16725628077983856, "step": 7405 }, { "epoch": 0.11058765557455259, "grad_norm": 0.36328125, "grad_norm_var": 0.0020339330037434894, "learning_rate": 0.0001, "loss": 1.59, "loss/crossentropy": 2.4699913263320923, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.22676844894886017, "step": 7406 }, { "epoch": 0.11060258774516758, "grad_norm": 0.4375, "grad_norm_var": 0.0023633321126302082, "learning_rate": 0.0001, "loss": 1.7398, "loss/crossentropy": 2.487750291824341, "loss/fcd": 1.48046875, "loss/idx": 10.0, "loss/logits": 0.2593507915735245, "step": 7407 }, { "epoch": 0.11061751991578256, "grad_norm": 0.42578125, "grad_norm_var": 0.0026102066040039062, "learning_rate": 0.0001, "loss": 1.7632, "loss/crossentropy": 2.448109269142151, "loss/fcd": 1.51953125, "loss/idx": 10.0, "loss/logits": 0.2436235100030899, "step": 7408 }, { "epoch": 0.11063245208639753, "grad_norm": 0.326171875, "grad_norm_var": 0.002657318115234375, "learning_rate": 0.0001, "loss": 1.4527, "loss/crossentropy": 2.6901594400405884, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18704526126384735, "step": 7409 }, { "epoch": 0.11064738425701252, "grad_norm": 0.384765625, "grad_norm_var": 0.0026300430297851564, "learning_rate": 0.0001, "loss": 1.607, "loss/crossentropy": 2.5413633584976196, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.21636385470628738, "step": 7410 }, { "epoch": 0.1106623164276275, "grad_norm": 0.353515625, "grad_norm_var": 0.002588844299316406, "learning_rate": 0.0001, "loss": 1.4296, "loss/crossentropy": 2.501651883125305, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.19135171175003052, "step": 7411 }, { "epoch": 0.11067724859824249, "grad_norm": 0.30859375, "grad_norm_var": 0.0025578816731770832, "learning_rate": 0.0001, "loss": 1.3267, "loss/crossentropy": 2.5677082538604736, "loss/fcd": 1.1640625, "loss/idx": 10.0, "loss/logits": 0.16266455501317978, "step": 7412 }, { "epoch": 0.11069218076885746, "grad_norm": 0.345703125, "grad_norm_var": 0.00249786376953125, "learning_rate": 0.0001, "loss": 1.4053, "loss/crossentropy": 2.6100562810897827, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.18260592967271805, "step": 7413 }, { "epoch": 0.11070711293947244, "grad_norm": 0.330078125, "grad_norm_var": 0.0025664647420247395, "learning_rate": 0.0001, "loss": 1.4077, "loss/crossentropy": 2.5117058753967285, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.16158107668161392, "step": 7414 }, { "epoch": 0.11072204511008743, "grad_norm": 0.375, "grad_norm_var": 0.002395486831665039, "learning_rate": 0.0001, "loss": 1.6129, "loss/crossentropy": 2.561347246170044, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.23013072460889816, "step": 7415 }, { "epoch": 0.1107369772807024, "grad_norm": 0.33984375, "grad_norm_var": 0.002402496337890625, "learning_rate": 0.0001, "loss": 1.5651, "loss/crossentropy": 2.5794777870178223, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.2291322723031044, "step": 7416 }, { "epoch": 0.1107519094513174, "grad_norm": 0.326171875, "grad_norm_var": 0.0012241204579671223, "learning_rate": 0.0001, "loss": 1.4756, "loss/crossentropy": 2.5601061582565308, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1943279132246971, "step": 7417 }, { "epoch": 0.11076684162193237, "grad_norm": 0.3984375, "grad_norm_var": 0.0013262430826822917, "learning_rate": 0.0001, "loss": 1.6757, "loss/crossentropy": 2.5077849626541138, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.2577774375677109, "step": 7418 }, { "epoch": 0.11078177379254735, "grad_norm": 0.341796875, "grad_norm_var": 0.0013430118560791016, "learning_rate": 0.0001, "loss": 1.5479, "loss/crossentropy": 2.3336949348449707, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.22370005398988724, "step": 7419 }, { "epoch": 0.11079670596316234, "grad_norm": 0.373046875, "grad_norm_var": 0.0013544559478759766, "learning_rate": 0.0001, "loss": 1.5983, "loss/crossentropy": 2.3062103986740112, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.19596363604068756, "step": 7420 }, { "epoch": 0.11081163813377731, "grad_norm": 0.32421875, "grad_norm_var": 0.0013945897420247395, "learning_rate": 0.0001, "loss": 1.4287, "loss/crossentropy": 2.5457738637924194, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1864972934126854, "step": 7421 }, { "epoch": 0.1108265703043923, "grad_norm": 0.333984375, "grad_norm_var": 0.0014339288075764973, "learning_rate": 0.0001, "loss": 1.4714, "loss/crossentropy": 2.630358576774597, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.20576277375221252, "step": 7422 }, { "epoch": 0.11084150247500728, "grad_norm": 0.326171875, "grad_norm_var": 0.0010253270467122397, "learning_rate": 0.0001, "loss": 1.4323, "loss/crossentropy": 2.5077717304229736, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.18621788173913956, "step": 7423 }, { "epoch": 0.11085643464562227, "grad_norm": 0.326171875, "grad_norm_var": 0.0006500085194905599, "learning_rate": 0.0001, "loss": 1.4563, "loss/crossentropy": 2.585021734237671, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.18290933966636658, "step": 7424 }, { "epoch": 0.11087136681623724, "grad_norm": 0.359375, "grad_norm_var": 0.0006373087565104167, "learning_rate": 0.0001, "loss": 1.4996, "loss/crossentropy": 2.64302659034729, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.1987960785627365, "step": 7425 }, { "epoch": 0.11088629898685222, "grad_norm": 0.326171875, "grad_norm_var": 0.0005543390909830729, "learning_rate": 0.0001, "loss": 1.4759, "loss/crossentropy": 2.602945923805237, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1946776583790779, "step": 7426 }, { "epoch": 0.11090123115746721, "grad_norm": 0.365234375, "grad_norm_var": 0.0005793253580729167, "learning_rate": 0.0001, "loss": 1.3926, "loss/crossentropy": 2.6157068014144897, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.16599440574645996, "step": 7427 }, { "epoch": 0.11091616332808218, "grad_norm": 0.326171875, "grad_norm_var": 0.0005162398020426433, "learning_rate": 0.0001, "loss": 1.4101, "loss/crossentropy": 2.7707351446151733, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.17966128140687943, "step": 7428 }, { "epoch": 0.11093109549869717, "grad_norm": 0.341796875, "grad_norm_var": 0.0005167484283447266, "learning_rate": 0.0001, "loss": 1.5611, "loss/crossentropy": 2.3313745260238647, "loss/fcd": 1.36328125, "loss/idx": 10.0, "loss/logits": 0.1977924257516861, "step": 7429 }, { "epoch": 0.11094602766931215, "grad_norm": 0.3203125, "grad_norm_var": 0.0005416234334309895, "learning_rate": 0.0001, "loss": 1.4128, "loss/crossentropy": 2.6954604387283325, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.18619991838932037, "step": 7430 }, { "epoch": 0.11096095983992713, "grad_norm": 0.33984375, "grad_norm_var": 0.00047353108723958336, "learning_rate": 0.0001, "loss": 1.3036, "loss/crossentropy": 2.707864761352539, "loss/fcd": 1.15234375, "loss/idx": 10.0, "loss/logits": 0.15124812722206116, "step": 7431 }, { "epoch": 0.11097589201054212, "grad_norm": 0.349609375, "grad_norm_var": 0.00047694842020670574, "learning_rate": 0.0001, "loss": 1.4638, "loss/crossentropy": 2.576660633087158, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19031942635774612, "step": 7432 }, { "epoch": 0.11099082418115709, "grad_norm": 0.43359375, "grad_norm_var": 0.0009656270345052083, "learning_rate": 0.0001, "loss": 1.573, "loss/crossentropy": 2.711246371269226, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.1940804123878479, "step": 7433 }, { "epoch": 0.11100575635177208, "grad_norm": 0.435546875, "grad_norm_var": 0.0012957096099853516, "learning_rate": 0.0001, "loss": 1.5905, "loss/crossentropy": 2.528927445411682, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.2116023749113083, "step": 7434 }, { "epoch": 0.11102068852238706, "grad_norm": 0.369140625, "grad_norm_var": 0.0013072808583577474, "learning_rate": 0.0001, "loss": 1.4262, "loss/crossentropy": 2.6061125993728638, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.17229877412319183, "step": 7435 }, { "epoch": 0.11103562069300203, "grad_norm": 0.353515625, "grad_norm_var": 0.001279306411743164, "learning_rate": 0.0001, "loss": 1.468, "loss/crossentropy": 2.697118878364563, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.20625346899032593, "step": 7436 }, { "epoch": 0.11105055286361702, "grad_norm": 0.33984375, "grad_norm_var": 0.0012368361155192058, "learning_rate": 0.0001, "loss": 1.5579, "loss/crossentropy": 2.6095592975616455, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.21418926119804382, "step": 7437 }, { "epoch": 0.111065485034232, "grad_norm": 0.369140625, "grad_norm_var": 0.0012253920237223308, "learning_rate": 0.0001, "loss": 1.4975, "loss/crossentropy": 2.6279250383377075, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.19676633924245834, "step": 7438 }, { "epoch": 0.11108041720484699, "grad_norm": 0.3671875, "grad_norm_var": 0.001172320048014323, "learning_rate": 0.0001, "loss": 1.6215, "loss/crossentropy": 2.676155924797058, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.23477084934711456, "step": 7439 }, { "epoch": 0.11109534937546196, "grad_norm": 0.353515625, "grad_norm_var": 0.0011042277018229166, "learning_rate": 0.0001, "loss": 1.5682, "loss/crossentropy": 2.640529155731201, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.2283705770969391, "step": 7440 }, { "epoch": 0.11111028154607694, "grad_norm": 0.486328125, "grad_norm_var": 0.0021115461985270183, "learning_rate": 0.0001, "loss": 1.8671, "loss/crossentropy": 2.361229658126831, "loss/fcd": 1.60546875, "loss/idx": 10.0, "loss/logits": 0.261658176779747, "step": 7441 }, { "epoch": 0.11112521371669193, "grad_norm": 0.349609375, "grad_norm_var": 0.0020173231760660807, "learning_rate": 0.0001, "loss": 1.4432, "loss/crossentropy": 2.4851168394088745, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.18146969377994537, "step": 7442 }, { "epoch": 0.1111401458873069, "grad_norm": 0.359375, "grad_norm_var": 0.002022234598795573, "learning_rate": 0.0001, "loss": 1.4326, "loss/crossentropy": 2.667452335357666, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18261681497097015, "step": 7443 }, { "epoch": 0.1111550780579219, "grad_norm": 0.3515625, "grad_norm_var": 0.0019195397694905599, "learning_rate": 0.0001, "loss": 1.4815, "loss/crossentropy": 2.7554056644439697, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.18076714873313904, "step": 7444 }, { "epoch": 0.11117001022853687, "grad_norm": 0.341796875, "grad_norm_var": 0.0019195397694905599, "learning_rate": 0.0001, "loss": 1.5925, "loss/crossentropy": 2.698175311088562, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.22138523310422897, "step": 7445 }, { "epoch": 0.11118494239915186, "grad_norm": 0.39453125, "grad_norm_var": 0.001772165298461914, "learning_rate": 0.0001, "loss": 1.8138, "loss/crossentropy": 2.459372878074646, "loss/fcd": 1.55078125, "loss/idx": 10.0, "loss/logits": 0.26299160718917847, "step": 7446 }, { "epoch": 0.11119987456976684, "grad_norm": 0.369140625, "grad_norm_var": 0.001689910888671875, "learning_rate": 0.0001, "loss": 1.6287, "loss/crossentropy": 2.723675012588501, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.24200476706027985, "step": 7447 }, { "epoch": 0.11121480674038181, "grad_norm": 0.310546875, "grad_norm_var": 0.0019251505533854167, "learning_rate": 0.0001, "loss": 1.4042, "loss/crossentropy": 2.570651888847351, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.1776752918958664, "step": 7448 }, { "epoch": 0.1112297389109968, "grad_norm": 0.3828125, "grad_norm_var": 0.0016829808553059896, "learning_rate": 0.0001, "loss": 1.6163, "loss/crossentropy": 2.724846363067627, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.22961120307445526, "step": 7449 }, { "epoch": 0.11124467108161178, "grad_norm": 0.62109375, "grad_norm_var": 0.005435291926066081, "learning_rate": 0.0001, "loss": 1.6795, "loss/crossentropy": 2.457575559616089, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.22249625623226166, "step": 7450 }, { "epoch": 0.11125960325222677, "grad_norm": 0.33984375, "grad_norm_var": 0.005540911356608073, "learning_rate": 0.0001, "loss": 1.493, "loss/crossentropy": 2.4327532052993774, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.2000424936413765, "step": 7451 }, { "epoch": 0.11127453542284174, "grad_norm": 0.427734375, "grad_norm_var": 0.005617014567057292, "learning_rate": 0.0001, "loss": 1.8359, "loss/crossentropy": 2.6044222116470337, "loss/fcd": 1.53515625, "loss/idx": 10.0, "loss/logits": 0.3007691353559494, "step": 7452 }, { "epoch": 0.11128946759345672, "grad_norm": 0.37109375, "grad_norm_var": 0.005488840738932291, "learning_rate": 0.0001, "loss": 1.5035, "loss/crossentropy": 2.497132420539856, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.21053394675254822, "step": 7453 }, { "epoch": 0.11130439976407171, "grad_norm": 0.3671875, "grad_norm_var": 0.005493783950805664, "learning_rate": 0.0001, "loss": 1.5443, "loss/crossentropy": 2.799384117126465, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.21620384603738785, "step": 7454 }, { "epoch": 0.11131933193468668, "grad_norm": 1.0, "grad_norm_var": 0.028843164443969727, "learning_rate": 0.0001, "loss": 2.1546, "loss/crossentropy": 2.4438945055007935, "loss/fcd": 1.71875, "loss/idx": 10.0, "loss/logits": 0.435879647731781, "step": 7455 }, { "epoch": 0.11133426410530167, "grad_norm": 0.34765625, "grad_norm_var": 0.028902435302734376, "learning_rate": 0.0001, "loss": 1.4972, "loss/crossentropy": 2.4165858030319214, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19253595918416977, "step": 7456 }, { "epoch": 0.11134919627591665, "grad_norm": 0.3203125, "grad_norm_var": 0.029295587539672853, "learning_rate": 0.0001, "loss": 1.4252, "loss/crossentropy": 2.4029974937438965, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.17521849274635315, "step": 7457 }, { "epoch": 0.11136412844653162, "grad_norm": 0.447265625, "grad_norm_var": 0.029028558731079103, "learning_rate": 0.0001, "loss": 1.6682, "loss/crossentropy": 2.6324546337127686, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.2580636367201805, "step": 7458 }, { "epoch": 0.11137906061714661, "grad_norm": 0.423828125, "grad_norm_var": 0.028750038146972655, "learning_rate": 0.0001, "loss": 1.4166, "loss/crossentropy": 2.609572172164917, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.17826981842517853, "step": 7459 }, { "epoch": 0.11139399278776159, "grad_norm": 0.39453125, "grad_norm_var": 0.028438822428385416, "learning_rate": 0.0001, "loss": 1.6531, "loss/crossentropy": 2.4963165521621704, "loss/fcd": 1.4140625, "loss/idx": 10.0, "loss/logits": 0.23906520754098892, "step": 7460 }, { "epoch": 0.11140892495837658, "grad_norm": 0.357421875, "grad_norm_var": 0.02827301025390625, "learning_rate": 0.0001, "loss": 1.3809, "loss/crossentropy": 2.7754719257354736, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.16995495557785034, "step": 7461 }, { "epoch": 0.11142385712899155, "grad_norm": 0.416015625, "grad_norm_var": 0.02820115089416504, "learning_rate": 0.0001, "loss": 1.662, "loss/crossentropy": 2.396488904953003, "loss/fcd": 1.4296875, "loss/idx": 10.0, "loss/logits": 0.23230372369289398, "step": 7462 }, { "epoch": 0.11143878929960653, "grad_norm": 0.375, "grad_norm_var": 0.028154945373535155, "learning_rate": 0.0001, "loss": 1.5342, "loss/crossentropy": 2.5192683935165405, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.19827093929052353, "step": 7463 }, { "epoch": 0.11145372147022152, "grad_norm": 0.3125, "grad_norm_var": 0.02812371253967285, "learning_rate": 0.0001, "loss": 1.3599, "loss/crossentropy": 2.473805785179138, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.1606798619031906, "step": 7464 }, { "epoch": 0.1114686536408365, "grad_norm": 0.380859375, "grad_norm_var": 0.028136634826660158, "learning_rate": 0.0001, "loss": 1.571, "loss/crossentropy": 2.613568663597107, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.2194547951221466, "step": 7465 }, { "epoch": 0.11148358581145149, "grad_norm": 0.34765625, "grad_norm_var": 0.025893592834472658, "learning_rate": 0.0001, "loss": 1.5881, "loss/crossentropy": 2.7224477529525757, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.22871187329292297, "step": 7466 }, { "epoch": 0.11149851798206646, "grad_norm": 0.333984375, "grad_norm_var": 0.025953912734985353, "learning_rate": 0.0001, "loss": 1.4973, "loss/crossentropy": 2.869402050971985, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.20042532682418823, "step": 7467 }, { "epoch": 0.11151345015268145, "grad_norm": 0.337890625, "grad_norm_var": 0.026293166478474937, "learning_rate": 0.0001, "loss": 1.4095, "loss/crossentropy": 2.683712959289551, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.18688421696424484, "step": 7468 }, { "epoch": 0.11152838232329643, "grad_norm": 0.46875, "grad_norm_var": 0.026404428482055663, "learning_rate": 0.0001, "loss": 1.8323, "loss/crossentropy": 2.6003421545028687, "loss/fcd": 1.57421875, "loss/idx": 10.0, "loss/logits": 0.25808506458997726, "step": 7469 }, { "epoch": 0.1115433144939114, "grad_norm": 0.9921875, "grad_norm_var": 0.04688172340393067, "learning_rate": 0.0001, "loss": 1.4794, "loss/crossentropy": 2.6715126037597656, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.1903572604060173, "step": 7470 }, { "epoch": 0.11155824666452639, "grad_norm": 0.361328125, "grad_norm_var": 0.025836944580078125, "learning_rate": 0.0001, "loss": 1.5642, "loss/crossentropy": 2.639997124671936, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.22821369767189026, "step": 7471 }, { "epoch": 0.11157317883514137, "grad_norm": 0.33203125, "grad_norm_var": 0.025989532470703125, "learning_rate": 0.0001, "loss": 1.5246, "loss/crossentropy": 2.4871158599853516, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.21604372560977936, "step": 7472 }, { "epoch": 0.11158811100575636, "grad_norm": 0.3203125, "grad_norm_var": 0.025989532470703125, "learning_rate": 0.0001, "loss": 1.3058, "loss/crossentropy": 2.6062121391296387, "loss/fcd": 1.14453125, "loss/idx": 10.0, "loss/logits": 0.16126196086406708, "step": 7473 }, { "epoch": 0.11160304317637133, "grad_norm": 0.33203125, "grad_norm_var": 0.026286808649698894, "learning_rate": 0.0001, "loss": 1.3414, "loss/crossentropy": 2.8042536973953247, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.16563095897436142, "step": 7474 }, { "epoch": 0.11161797534698631, "grad_norm": 0.3671875, "grad_norm_var": 0.026348114013671875, "learning_rate": 0.0001, "loss": 1.4425, "loss/crossentropy": 2.6085071563720703, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.18468184024095535, "step": 7475 }, { "epoch": 0.1116329075176013, "grad_norm": 0.412109375, "grad_norm_var": 0.02635025978088379, "learning_rate": 0.0001, "loss": 1.7722, "loss/crossentropy": 2.909002423286438, "loss/fcd": 1.4765625, "loss/idx": 10.0, "loss/logits": 0.29562605917453766, "step": 7476 }, { "epoch": 0.11164783968821627, "grad_norm": 0.5078125, "grad_norm_var": 0.02685082753499349, "learning_rate": 0.0001, "loss": 2.1057, "loss/crossentropy": 2.564319372177124, "loss/fcd": 1.75, "loss/idx": 10.0, "loss/logits": 0.35574010014533997, "step": 7477 }, { "epoch": 0.11166277185883126, "grad_norm": 0.443359375, "grad_norm_var": 0.026910909016927085, "learning_rate": 0.0001, "loss": 1.3813, "loss/crossentropy": 2.6000412702560425, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.1664896383881569, "step": 7478 }, { "epoch": 0.11167770402944624, "grad_norm": 0.3671875, "grad_norm_var": 0.026955413818359374, "learning_rate": 0.0001, "loss": 1.5863, "loss/crossentropy": 2.4870084524154663, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.2112523540854454, "step": 7479 }, { "epoch": 0.11169263620006122, "grad_norm": 0.404296875, "grad_norm_var": 0.026244974136352538, "learning_rate": 0.0001, "loss": 1.5168, "loss/crossentropy": 2.986755609512329, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.200348399579525, "step": 7480 }, { "epoch": 0.1117075683706762, "grad_norm": 0.53515625, "grad_norm_var": 0.026941871643066405, "learning_rate": 0.0001, "loss": 1.5837, "loss/crossentropy": 2.5193322896957397, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.22429795563220978, "step": 7481 }, { "epoch": 0.11172250054129118, "grad_norm": 0.419921875, "grad_norm_var": 0.026484918594360352, "learning_rate": 0.0001, "loss": 1.5609, "loss/crossentropy": 2.5866695642471313, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.19369689375162125, "step": 7482 }, { "epoch": 0.11173743271190617, "grad_norm": 0.412109375, "grad_norm_var": 0.02583006223042806, "learning_rate": 0.0001, "loss": 1.5324, "loss/crossentropy": 2.3885011672973633, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.208208367228508, "step": 7483 }, { "epoch": 0.11175236488252115, "grad_norm": 0.38671875, "grad_norm_var": 0.02532501220703125, "learning_rate": 0.0001, "loss": 1.5558, "loss/crossentropy": 2.521067261695862, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.20420119166374207, "step": 7484 }, { "epoch": 0.11176729705313614, "grad_norm": 0.37890625, "grad_norm_var": 0.02550195058186849, "learning_rate": 0.0001, "loss": 1.5336, "loss/crossentropy": 2.94169819355011, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.22110223770141602, "step": 7485 }, { "epoch": 0.11178222922375111, "grad_norm": 0.43359375, "grad_norm_var": 0.003563690185546875, "learning_rate": 0.0001, "loss": 1.5024, "loss/crossentropy": 2.5341427326202393, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19776053726673126, "step": 7486 }, { "epoch": 0.11179716139436609, "grad_norm": 0.35546875, "grad_norm_var": 0.0035967350006103514, "learning_rate": 0.0001, "loss": 1.4131, "loss/crossentropy": 2.6444965600967407, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.17873164266347885, "step": 7487 }, { "epoch": 0.11181209356498108, "grad_norm": 0.34375, "grad_norm_var": 0.0034983158111572266, "learning_rate": 0.0001, "loss": 1.4927, "loss/crossentropy": 2.56127393245697, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.2036653533577919, "step": 7488 }, { "epoch": 0.11182702573559605, "grad_norm": 0.349609375, "grad_norm_var": 0.0032358169555664062, "learning_rate": 0.0001, "loss": 1.3919, "loss/crossentropy": 2.432770848274231, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.18483111262321472, "step": 7489 }, { "epoch": 0.11184195790621104, "grad_norm": 0.31640625, "grad_norm_var": 0.003399085998535156, "learning_rate": 0.0001, "loss": 1.3699, "loss/crossentropy": 2.509089469909668, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.17068412899971008, "step": 7490 }, { "epoch": 0.11185689007682602, "grad_norm": 0.337890625, "grad_norm_var": 0.0035891056060791014, "learning_rate": 0.0001, "loss": 1.3175, "loss/crossentropy": 2.623411774635315, "loss/fcd": 1.15234375, "loss/idx": 10.0, "loss/logits": 0.16518432646989822, "step": 7491 }, { "epoch": 0.111871822247441, "grad_norm": 0.36328125, "grad_norm_var": 0.003661028544108073, "learning_rate": 0.0001, "loss": 1.4525, "loss/crossentropy": 2.6522737741470337, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19466756284236908, "step": 7492 }, { "epoch": 0.11188675441805598, "grad_norm": 0.359375, "grad_norm_var": 0.0028492609659830728, "learning_rate": 0.0001, "loss": 1.5539, "loss/crossentropy": 2.568835973739624, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.2101435363292694, "step": 7493 }, { "epoch": 0.11190168658867096, "grad_norm": 0.337890625, "grad_norm_var": 0.0027651468912760417, "learning_rate": 0.0001, "loss": 1.3961, "loss/crossentropy": 2.849856972694397, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.1773257628083229, "step": 7494 }, { "epoch": 0.11191661875928595, "grad_norm": 0.390625, "grad_norm_var": 0.0027552286783854166, "learning_rate": 0.0001, "loss": 1.5461, "loss/crossentropy": 2.6659839153289795, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.20627011358737946, "step": 7495 }, { "epoch": 0.11193155092990092, "grad_norm": 0.50390625, "grad_norm_var": 0.003660694758097331, "learning_rate": 0.0001, "loss": 1.4532, "loss/crossentropy": 2.6049578189849854, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1954241767525673, "step": 7496 }, { "epoch": 0.1119464831005159, "grad_norm": 0.37109375, "grad_norm_var": 0.002146641413370768, "learning_rate": 0.0001, "loss": 1.4845, "loss/crossentropy": 2.8126503229141235, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19541793316602707, "step": 7497 }, { "epoch": 0.11196141527113089, "grad_norm": 0.361328125, "grad_norm_var": 0.002039829889933268, "learning_rate": 0.0001, "loss": 1.5728, "loss/crossentropy": 2.6330467462539673, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.24079225212335587, "step": 7498 }, { "epoch": 0.11197634744174587, "grad_norm": 0.359375, "grad_norm_var": 0.001953570048014323, "learning_rate": 0.0001, "loss": 1.5078, "loss/crossentropy": 2.825756549835205, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.21874761581420898, "step": 7499 }, { "epoch": 0.11199127961236086, "grad_norm": 0.5390625, "grad_norm_var": 0.0037066141764322915, "learning_rate": 0.0001, "loss": 2.0742, "loss/crossentropy": 2.6656627655029297, "loss/fcd": 1.70703125, "loss/idx": 10.0, "loss/logits": 0.36712757498025894, "step": 7500 }, { "epoch": 0.11200621178297583, "grad_norm": 0.3671875, "grad_norm_var": 0.0037190119425455728, "learning_rate": 0.0001, "loss": 1.6968, "loss/crossentropy": 2.6271332502365112, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.25934265553951263, "step": 7501 }, { "epoch": 0.11202114395359081, "grad_norm": 0.3203125, "grad_norm_var": 0.003720855712890625, "learning_rate": 0.0001, "loss": 1.4763, "loss/crossentropy": 2.4301823377609253, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.19110485911369324, "step": 7502 }, { "epoch": 0.1120360761242058, "grad_norm": 0.333984375, "grad_norm_var": 0.0038014570871988933, "learning_rate": 0.0001, "loss": 1.5738, "loss/crossentropy": 2.590323805809021, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.22617898136377335, "step": 7503 }, { "epoch": 0.11205100829482077, "grad_norm": 0.314453125, "grad_norm_var": 0.003966204325358073, "learning_rate": 0.0001, "loss": 1.3483, "loss/crossentropy": 2.6497113704681396, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.16466236859560013, "step": 7504 }, { "epoch": 0.11206594046543576, "grad_norm": 0.390625, "grad_norm_var": 0.003957859675089518, "learning_rate": 0.0001, "loss": 1.4784, "loss/crossentropy": 2.5980029106140137, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.20105019956827164, "step": 7505 }, { "epoch": 0.11208087263605074, "grad_norm": 0.322265625, "grad_norm_var": 0.003915850321451823, "learning_rate": 0.0001, "loss": 1.5293, "loss/crossentropy": 2.6722984313964844, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.21284828335046768, "step": 7506 }, { "epoch": 0.11209580480666573, "grad_norm": 0.4609375, "grad_norm_var": 0.004281346003214518, "learning_rate": 0.0001, "loss": 1.5488, "loss/crossentropy": 2.6323097944259644, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.2167246863245964, "step": 7507 }, { "epoch": 0.1121107369772807, "grad_norm": 0.341796875, "grad_norm_var": 0.00436089833577474, "learning_rate": 0.0001, "loss": 1.3251, "loss/crossentropy": 2.759631633758545, "loss/fcd": 1.16015625, "loss/idx": 10.0, "loss/logits": 0.16492003947496414, "step": 7508 }, { "epoch": 0.11212566914789568, "grad_norm": 0.462890625, "grad_norm_var": 0.004750935236612955, "learning_rate": 0.0001, "loss": 1.7741, "loss/crossentropy": 2.4271132946014404, "loss/fcd": 1.5234375, "loss/idx": 10.0, "loss/logits": 0.2506364956498146, "step": 7509 }, { "epoch": 0.11214060131851067, "grad_norm": 0.3359375, "grad_norm_var": 0.0047637303670247395, "learning_rate": 0.0001, "loss": 1.4848, "loss/crossentropy": 2.434825897216797, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.18796394020318985, "step": 7510 }, { "epoch": 0.11215553348912564, "grad_norm": 0.310546875, "grad_norm_var": 0.005114984512329101, "learning_rate": 0.0001, "loss": 1.4652, "loss/crossentropy": 2.7525110244750977, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.19562116265296936, "step": 7511 }, { "epoch": 0.11217046565974063, "grad_norm": 0.3671875, "grad_norm_var": 0.004042418797810873, "learning_rate": 0.0001, "loss": 1.4746, "loss/crossentropy": 2.6376209259033203, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.1894397884607315, "step": 7512 }, { "epoch": 0.11218539783035561, "grad_norm": 0.357421875, "grad_norm_var": 0.004056549072265625, "learning_rate": 0.0001, "loss": 1.3911, "loss/crossentropy": 2.5586835145950317, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.16846145689487457, "step": 7513 }, { "epoch": 0.11220033000097059, "grad_norm": 0.46484375, "grad_norm_var": 0.004584741592407226, "learning_rate": 0.0001, "loss": 1.6409, "loss/crossentropy": 2.529021739959717, "loss/fcd": 1.40234375, "loss/idx": 10.0, "loss/logits": 0.2385615110397339, "step": 7514 }, { "epoch": 0.11221526217158558, "grad_norm": 0.33984375, "grad_norm_var": 0.004657220840454101, "learning_rate": 0.0001, "loss": 1.4249, "loss/crossentropy": 2.705923914909363, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18663321435451508, "step": 7515 }, { "epoch": 0.11223019434220055, "grad_norm": 0.330078125, "grad_norm_var": 0.002866363525390625, "learning_rate": 0.0001, "loss": 1.4157, "loss/crossentropy": 2.505809783935547, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1734725683927536, "step": 7516 }, { "epoch": 0.11224512651281554, "grad_norm": 0.3203125, "grad_norm_var": 0.002982330322265625, "learning_rate": 0.0001, "loss": 1.3902, "loss/crossentropy": 2.7481101751327515, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.1792338490486145, "step": 7517 }, { "epoch": 0.11226005868343052, "grad_norm": 0.294921875, "grad_norm_var": 0.003159825007120768, "learning_rate": 0.0001, "loss": 1.4002, "loss/crossentropy": 2.4279061555862427, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17752324789762497, "step": 7518 }, { "epoch": 0.11227499085404549, "grad_norm": 0.337890625, "grad_norm_var": 0.0031476179758707684, "learning_rate": 0.0001, "loss": 1.3961, "loss/crossentropy": 2.6817389726638794, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.18511834740638733, "step": 7519 }, { "epoch": 0.11228992302466048, "grad_norm": 0.38671875, "grad_norm_var": 0.0030399958292643228, "learning_rate": 0.0001, "loss": 1.5294, "loss/crossentropy": 2.6547160148620605, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.20522049814462662, "step": 7520 }, { "epoch": 0.11230485519527546, "grad_norm": 0.349609375, "grad_norm_var": 0.002999607721964518, "learning_rate": 0.0001, "loss": 1.5782, "loss/crossentropy": 2.3619325160980225, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.22275033593177795, "step": 7521 }, { "epoch": 0.11231978736589045, "grad_norm": 0.36328125, "grad_norm_var": 0.002890459696451823, "learning_rate": 0.0001, "loss": 1.4985, "loss/crossentropy": 2.4343624114990234, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19384663552045822, "step": 7522 }, { "epoch": 0.11233471953650542, "grad_norm": 0.36328125, "grad_norm_var": 0.0022244771321614583, "learning_rate": 0.0001, "loss": 1.6591, "loss/crossentropy": 2.5369179248809814, "loss/fcd": 1.4296875, "loss/idx": 10.0, "loss/logits": 0.22944732010364532, "step": 7523 }, { "epoch": 0.1123496517071204, "grad_norm": 0.349609375, "grad_norm_var": 0.0022115071614583334, "learning_rate": 0.0001, "loss": 1.5021, "loss/crossentropy": 2.679864525794983, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.20519434660673141, "step": 7524 }, { "epoch": 0.11236458387773539, "grad_norm": 0.455078125, "grad_norm_var": 0.002106475830078125, "learning_rate": 0.0001, "loss": 2.0407, "loss/crossentropy": 2.5488322973251343, "loss/fcd": 1.6796875, "loss/idx": 10.0, "loss/logits": 0.3609641492366791, "step": 7525 }, { "epoch": 0.11237951604835036, "grad_norm": 0.5546875, "grad_norm_var": 0.004456329345703125, "learning_rate": 0.0001, "loss": 1.9634, "loss/crossentropy": 2.655510425567627, "loss/fcd": 1.72265625, "loss/idx": 10.0, "loss/logits": 0.2407132387161255, "step": 7526 }, { "epoch": 0.11239444821896535, "grad_norm": 0.353515625, "grad_norm_var": 0.0042220433553059895, "learning_rate": 0.0001, "loss": 1.482, "loss/crossentropy": 2.5535558462142944, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19290615618228912, "step": 7527 }, { "epoch": 0.11240938038958033, "grad_norm": 0.333984375, "grad_norm_var": 0.0043222904205322266, "learning_rate": 0.0001, "loss": 1.3127, "loss/crossentropy": 2.5714502334594727, "loss/fcd": 1.15234375, "loss/idx": 10.0, "loss/logits": 0.1603843793272972, "step": 7528 }, { "epoch": 0.11242431256019532, "grad_norm": 0.37890625, "grad_norm_var": 0.004308827718098958, "learning_rate": 0.0001, "loss": 1.5736, "loss/crossentropy": 2.6963073015213013, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.20643409341573715, "step": 7529 }, { "epoch": 0.1124392447308103, "grad_norm": 0.375, "grad_norm_var": 0.003719520568847656, "learning_rate": 0.0001, "loss": 1.4801, "loss/crossentropy": 2.376649260520935, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.18711896985769272, "step": 7530 }, { "epoch": 0.11245417690142527, "grad_norm": 0.322265625, "grad_norm_var": 0.003804636001586914, "learning_rate": 0.0001, "loss": 1.4707, "loss/crossentropy": 2.5215197801589966, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.20121105015277863, "step": 7531 }, { "epoch": 0.11246910907204026, "grad_norm": 0.322265625, "grad_norm_var": 0.00384672482808431, "learning_rate": 0.0001, "loss": 1.397, "loss/crossentropy": 2.6382672786712646, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.1821639984846115, "step": 7532 }, { "epoch": 0.11248404124265524, "grad_norm": 0.396484375, "grad_norm_var": 0.003741963704427083, "learning_rate": 0.0001, "loss": 1.546, "loss/crossentropy": 2.761801838874817, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.21005111932754517, "step": 7533 }, { "epoch": 0.11249897341327023, "grad_norm": 0.30859375, "grad_norm_var": 0.0036147912343343098, "learning_rate": 0.0001, "loss": 1.4417, "loss/crossentropy": 2.439925193786621, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.18002507835626602, "step": 7534 }, { "epoch": 0.1125139055838852, "grad_norm": 0.328125, "grad_norm_var": 0.0036650975545247395, "learning_rate": 0.0001, "loss": 1.3596, "loss/crossentropy": 2.4258999824523926, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.16036292910575867, "step": 7535 }, { "epoch": 0.11252883775450018, "grad_norm": 0.333984375, "grad_norm_var": 0.0037307580312093097, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.546040892601013, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.1976020261645317, "step": 7536 }, { "epoch": 0.11254376992511517, "grad_norm": 0.392578125, "grad_norm_var": 0.003740549087524414, "learning_rate": 0.0001, "loss": 1.4607, "loss/crossentropy": 2.4466283321380615, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.1833907887339592, "step": 7537 }, { "epoch": 0.11255870209573014, "grad_norm": 0.375, "grad_norm_var": 0.0037374973297119142, "learning_rate": 0.0001, "loss": 1.3815, "loss/crossentropy": 2.748242497444153, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17055577039718628, "step": 7538 }, { "epoch": 0.11257363426634513, "grad_norm": 0.322265625, "grad_norm_var": 0.003887367248535156, "learning_rate": 0.0001, "loss": 1.4178, "loss/crossentropy": 2.5520130395889282, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.17951375246047974, "step": 7539 }, { "epoch": 0.11258856643696011, "grad_norm": 0.353515625, "grad_norm_var": 0.0038782755533854165, "learning_rate": 0.0001, "loss": 1.5575, "loss/crossentropy": 2.615869402885437, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.2059720754623413, "step": 7540 }, { "epoch": 0.11260349860757508, "grad_norm": 0.388671875, "grad_norm_var": 0.0033929824829101564, "learning_rate": 0.0001, "loss": 1.6598, "loss/crossentropy": 2.444947600364685, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.22620829939842224, "step": 7541 }, { "epoch": 0.11261843077819007, "grad_norm": 0.375, "grad_norm_var": 0.0008661270141601563, "learning_rate": 0.0001, "loss": 1.4629, "loss/crossentropy": 2.5929102897644043, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.1933564692735672, "step": 7542 }, { "epoch": 0.11263336294880505, "grad_norm": 0.34765625, "grad_norm_var": 0.0008684635162353515, "learning_rate": 0.0001, "loss": 1.3888, "loss/crossentropy": 2.4443302154541016, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.17396005988121033, "step": 7543 }, { "epoch": 0.11264829511942004, "grad_norm": 0.330078125, "grad_norm_var": 0.0008795261383056641, "learning_rate": 0.0001, "loss": 1.3913, "loss/crossentropy": 2.4914519786834717, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.1725383773446083, "step": 7544 }, { "epoch": 0.11266322729003501, "grad_norm": 0.3671875, "grad_norm_var": 0.0008478641510009765, "learning_rate": 0.0001, "loss": 1.5663, "loss/crossentropy": 2.539816975593567, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.22641142457723618, "step": 7545 }, { "epoch": 0.11267815946065, "grad_norm": 0.361328125, "grad_norm_var": 0.0008183797200520833, "learning_rate": 0.0001, "loss": 1.5437, "loss/crossentropy": 2.6123056411743164, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.20381879061460495, "step": 7546 }, { "epoch": 0.11269309163126498, "grad_norm": 0.341796875, "grad_norm_var": 0.0007659276326497396, "learning_rate": 0.0001, "loss": 1.4973, "loss/crossentropy": 2.6143064498901367, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.20428241044282913, "step": 7547 }, { "epoch": 0.11270802380187996, "grad_norm": 0.322265625, "grad_norm_var": 0.0007659276326497396, "learning_rate": 0.0001, "loss": 1.4774, "loss/crossentropy": 2.4490764141082764, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.18829601258039474, "step": 7548 }, { "epoch": 0.11272295597249495, "grad_norm": 0.333984375, "grad_norm_var": 0.0006458918253580729, "learning_rate": 0.0001, "loss": 1.5047, "loss/crossentropy": 2.6752618551254272, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.20779096335172653, "step": 7549 }, { "epoch": 0.11273788814310992, "grad_norm": 0.32421875, "grad_norm_var": 0.0005772272745768229, "learning_rate": 0.0001, "loss": 1.5369, "loss/crossentropy": 2.6371442079544067, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.21656185388565063, "step": 7550 }, { "epoch": 0.11275282031372491, "grad_norm": 0.345703125, "grad_norm_var": 0.0005456129709879557, "learning_rate": 0.0001, "loss": 1.411, "loss/crossentropy": 2.7250006198883057, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.17667391896247864, "step": 7551 }, { "epoch": 0.11276775248433989, "grad_norm": 0.388671875, "grad_norm_var": 0.0006088097890218099, "learning_rate": 0.0001, "loss": 1.5893, "loss/crossentropy": 2.747185230255127, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.20653259754180908, "step": 7552 }, { "epoch": 0.11278268465495486, "grad_norm": 0.32421875, "grad_norm_var": 0.0005526224772135417, "learning_rate": 0.0001, "loss": 1.3613, "loss/crossentropy": 2.5497753620147705, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.16597864776849747, "step": 7553 }, { "epoch": 0.11279761682556985, "grad_norm": 0.388671875, "grad_norm_var": 0.0006096998850504558, "learning_rate": 0.0001, "loss": 1.5725, "loss/crossentropy": 2.50674307346344, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.21700559556484222, "step": 7554 }, { "epoch": 0.11281254899618483, "grad_norm": 0.32421875, "grad_norm_var": 0.0006024678548177083, "learning_rate": 0.0001, "loss": 1.4644, "loss/crossentropy": 2.577783942222595, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.1948516070842743, "step": 7555 }, { "epoch": 0.11282748116679982, "grad_norm": 0.4140625, "grad_norm_var": 0.000851297378540039, "learning_rate": 0.0001, "loss": 1.4703, "loss/crossentropy": 2.5393792390823364, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1890968233346939, "step": 7556 }, { "epoch": 0.1128424133374148, "grad_norm": 0.34375, "grad_norm_var": 0.0007748921712239583, "learning_rate": 0.0001, "loss": 1.4835, "loss/crossentropy": 2.4150360822677612, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.20617622882127762, "step": 7557 }, { "epoch": 0.11285734550802977, "grad_norm": 0.345703125, "grad_norm_var": 0.0007388909657796224, "learning_rate": 0.0001, "loss": 1.5144, "loss/crossentropy": 2.488414764404297, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.2058209329843521, "step": 7558 }, { "epoch": 0.11287227767864476, "grad_norm": 0.3515625, "grad_norm_var": 0.0007385094960530599, "learning_rate": 0.0001, "loss": 1.5024, "loss/crossentropy": 2.252167820930481, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.197721965610981, "step": 7559 }, { "epoch": 0.11288720984925973, "grad_norm": 0.357421875, "grad_norm_var": 0.0007109165191650391, "learning_rate": 0.0001, "loss": 1.5285, "loss/crossentropy": 2.6435850858688354, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.2042543664574623, "step": 7560 }, { "epoch": 0.11290214201987472, "grad_norm": 0.423828125, "grad_norm_var": 0.0010248184204101562, "learning_rate": 0.0001, "loss": 1.4881, "loss/crossentropy": 2.46798038482666, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.18729573488235474, "step": 7561 }, { "epoch": 0.1129170741904897, "grad_norm": 0.33984375, "grad_norm_var": 0.00103758176167806, "learning_rate": 0.0001, "loss": 1.4574, "loss/crossentropy": 3.0250786542892456, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19962668418884277, "step": 7562 }, { "epoch": 0.11293200636110468, "grad_norm": 0.365234375, "grad_norm_var": 0.0010326226552327474, "learning_rate": 0.0001, "loss": 1.446, "loss/crossentropy": 2.718279242515564, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.17649059742689133, "step": 7563 }, { "epoch": 0.11294693853171967, "grad_norm": 0.349609375, "grad_norm_var": 0.0009569644927978516, "learning_rate": 0.0001, "loss": 1.5579, "loss/crossentropy": 2.6552584171295166, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.22587242722511292, "step": 7564 }, { "epoch": 0.11296187070233464, "grad_norm": 0.349609375, "grad_norm_var": 0.0009231408437093099, "learning_rate": 0.0001, "loss": 1.5097, "loss/crossentropy": 2.6286141872406006, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.20110075920820236, "step": 7565 }, { "epoch": 0.11297680287294963, "grad_norm": 0.365234375, "grad_norm_var": 0.0008406956990559896, "learning_rate": 0.0001, "loss": 1.6588, "loss/crossentropy": 2.268960952758789, "loss/fcd": 1.4453125, "loss/idx": 10.0, "loss/logits": 0.21345620602369308, "step": 7566 }, { "epoch": 0.1129917350435646, "grad_norm": 0.337890625, "grad_norm_var": 0.0008605321248372396, "learning_rate": 0.0001, "loss": 1.4192, "loss/crossentropy": 2.4316976070404053, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.17308499664068222, "step": 7567 }, { "epoch": 0.1130066672141796, "grad_norm": 0.365234375, "grad_norm_var": 0.0008071263631184896, "learning_rate": 0.0001, "loss": 1.5708, "loss/crossentropy": 2.463336944580078, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.21141773462295532, "step": 7568 }, { "epoch": 0.11302159938479457, "grad_norm": 0.34375, "grad_norm_var": 0.00074005126953125, "learning_rate": 0.0001, "loss": 1.4174, "loss/crossentropy": 2.497198700904846, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.175221286714077, "step": 7569 }, { "epoch": 0.11303653155540955, "grad_norm": 0.318359375, "grad_norm_var": 0.000783538818359375, "learning_rate": 0.0001, "loss": 1.503, "loss/crossentropy": 2.6069939136505127, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19830022752285004, "step": 7570 }, { "epoch": 0.11305146372602454, "grad_norm": 0.33203125, "grad_norm_var": 0.0007542928059895833, "learning_rate": 0.0001, "loss": 1.5457, "loss/crossentropy": 2.6618306636810303, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20973225682973862, "step": 7571 }, { "epoch": 0.11306639589663951, "grad_norm": 0.369140625, "grad_norm_var": 0.0005353132883707683, "learning_rate": 0.0001, "loss": 1.4735, "loss/crossentropy": 2.4169363975524902, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1922794207930565, "step": 7572 }, { "epoch": 0.1130813280672545, "grad_norm": 0.447265625, "grad_norm_var": 0.001068560282389323, "learning_rate": 0.0001, "loss": 1.5955, "loss/crossentropy": 2.590570330619812, "loss/fcd": 1.3828125, "loss/idx": 10.0, "loss/logits": 0.2126927375793457, "step": 7573 }, { "epoch": 0.11309626023786948, "grad_norm": 0.349609375, "grad_norm_var": 0.00106201171875, "learning_rate": 0.0001, "loss": 1.6218, "loss/crossentropy": 2.471687436103821, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.23121990263462067, "step": 7574 }, { "epoch": 0.11311119240848445, "grad_norm": 0.373046875, "grad_norm_var": 0.001065683364868164, "learning_rate": 0.0001, "loss": 1.5011, "loss/crossentropy": 2.717391610145569, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.2042425349354744, "step": 7575 }, { "epoch": 0.11312612457909944, "grad_norm": 0.400390625, "grad_norm_var": 0.0011566003163655599, "learning_rate": 0.0001, "loss": 1.5899, "loss/crossentropy": 2.5235416889190674, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.21885528415441513, "step": 7576 }, { "epoch": 0.11314105674971442, "grad_norm": 0.34375, "grad_norm_var": 0.0009226481119791667, "learning_rate": 0.0001, "loss": 1.4049, "loss/crossentropy": 2.7505041360855103, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17831328511238098, "step": 7577 }, { "epoch": 0.11315598892032941, "grad_norm": 0.333984375, "grad_norm_var": 0.0009400526682535808, "learning_rate": 0.0001, "loss": 1.4528, "loss/crossentropy": 2.617835521697998, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.20668243616819382, "step": 7578 }, { "epoch": 0.11317092109094438, "grad_norm": 0.333984375, "grad_norm_var": 0.0009751478830973307, "learning_rate": 0.0001, "loss": 1.5246, "loss/crossentropy": 2.5068256855010986, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.2160344123840332, "step": 7579 }, { "epoch": 0.11318585326155936, "grad_norm": 0.361328125, "grad_norm_var": 0.0009720961252848308, "learning_rate": 0.0001, "loss": 1.6093, "loss/crossentropy": 2.6521915197372437, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.23039362579584122, "step": 7580 }, { "epoch": 0.11320078543217435, "grad_norm": 0.390625, "grad_norm_var": 0.0010325113932291667, "learning_rate": 0.0001, "loss": 1.7921, "loss/crossentropy": 2.402975082397461, "loss/fcd": 1.51953125, "loss/idx": 10.0, "loss/logits": 0.2725226655602455, "step": 7581 }, { "epoch": 0.11321571760278933, "grad_norm": 0.5078125, "grad_norm_var": 0.0023958683013916016, "learning_rate": 0.0001, "loss": 2.2128, "loss/crossentropy": 2.322152256965637, "loss/fcd": 1.76171875, "loss/idx": 10.0, "loss/logits": 0.4510797709226608, "step": 7582 }, { "epoch": 0.11323064977340432, "grad_norm": 0.427734375, "grad_norm_var": 0.002524550755818685, "learning_rate": 0.0001, "loss": 1.4925, "loss/crossentropy": 2.57879638671875, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.18387748301029205, "step": 7583 }, { "epoch": 0.11324558194401929, "grad_norm": 0.322265625, "grad_norm_var": 0.0026951948801676433, "learning_rate": 0.0001, "loss": 1.5603, "loss/crossentropy": 2.4468079805374146, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.22828862816095352, "step": 7584 }, { "epoch": 0.11326051411463427, "grad_norm": 0.4140625, "grad_norm_var": 0.002737538019816081, "learning_rate": 0.0001, "loss": 1.5168, "loss/crossentropy": 2.454715609550476, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.19253990799188614, "step": 7585 }, { "epoch": 0.11327544628524926, "grad_norm": 0.322265625, "grad_norm_var": 0.0027081648508707683, "learning_rate": 0.0001, "loss": 1.4697, "loss/crossentropy": 2.6753989458084106, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.21184999495744705, "step": 7586 }, { "epoch": 0.11329037845586423, "grad_norm": 0.412109375, "grad_norm_var": 0.002630615234375, "learning_rate": 0.0001, "loss": 1.7228, "loss/crossentropy": 2.6366368532180786, "loss/fcd": 1.4765625, "loss/idx": 10.0, "loss/logits": 0.2462758868932724, "step": 7587 }, { "epoch": 0.11330531062647922, "grad_norm": 0.326171875, "grad_norm_var": 0.0028187433878580728, "learning_rate": 0.0001, "loss": 1.4908, "loss/crossentropy": 2.557676911354065, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.20170576870441437, "step": 7588 }, { "epoch": 0.1133202427970942, "grad_norm": 0.3515625, "grad_norm_var": 0.0025220076243082684, "learning_rate": 0.0001, "loss": 1.5528, "loss/crossentropy": 2.427956700325012, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.22076010704040527, "step": 7589 }, { "epoch": 0.11333517496770919, "grad_norm": 0.318359375, "grad_norm_var": 0.0026812076568603514, "learning_rate": 0.0001, "loss": 1.3812, "loss/crossentropy": 2.571128726005554, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.17414632439613342, "step": 7590 }, { "epoch": 0.11335010713832416, "grad_norm": 0.396484375, "grad_norm_var": 0.002721261978149414, "learning_rate": 0.0001, "loss": 1.6172, "loss/crossentropy": 2.6707552671432495, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.24224217236042023, "step": 7591 }, { "epoch": 0.11336503930893914, "grad_norm": 0.42578125, "grad_norm_var": 0.002855364481608073, "learning_rate": 0.0001, "loss": 1.6725, "loss/crossentropy": 2.9102877378463745, "loss/fcd": 1.4375, "loss/idx": 10.0, "loss/logits": 0.23498935997486115, "step": 7592 }, { "epoch": 0.11337997147955413, "grad_norm": 0.376953125, "grad_norm_var": 0.002789163589477539, "learning_rate": 0.0001, "loss": 1.4429, "loss/crossentropy": 2.5057936906814575, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.18512943387031555, "step": 7593 }, { "epoch": 0.1133949036501691, "grad_norm": 0.3828125, "grad_norm_var": 0.002662404378255208, "learning_rate": 0.0001, "loss": 1.4907, "loss/crossentropy": 2.5801864862442017, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.1859719604253769, "step": 7594 }, { "epoch": 0.1134098358207841, "grad_norm": 0.349609375, "grad_norm_var": 0.0025830586751302083, "learning_rate": 0.0001, "loss": 1.3946, "loss/crossentropy": 2.5944459438323975, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.17581099271774292, "step": 7595 }, { "epoch": 0.11342476799139907, "grad_norm": 0.392578125, "grad_norm_var": 0.0025647481282552085, "learning_rate": 0.0001, "loss": 1.4711, "loss/crossentropy": 2.6738702058792114, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.19372856616973877, "step": 7596 }, { "epoch": 0.11343970016201405, "grad_norm": 0.35546875, "grad_norm_var": 0.0026030858357747396, "learning_rate": 0.0001, "loss": 1.5155, "loss/crossentropy": 2.635720372200012, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.21079040318727493, "step": 7597 }, { "epoch": 0.11345463233262904, "grad_norm": 0.318359375, "grad_norm_var": 0.0016209761301676433, "learning_rate": 0.0001, "loss": 1.3846, "loss/crossentropy": 2.3652749061584473, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17370834946632385, "step": 7598 }, { "epoch": 0.11346956450324401, "grad_norm": 0.328125, "grad_norm_var": 0.0014515558878580729, "learning_rate": 0.0001, "loss": 1.3641, "loss/crossentropy": 2.7271522283554077, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.16881989687681198, "step": 7599 }, { "epoch": 0.113484496673859, "grad_norm": 0.322265625, "grad_norm_var": 0.0014515558878580729, "learning_rate": 0.0001, "loss": 1.423, "loss/crossentropy": 2.7012282609939575, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1807771474123001, "step": 7600 }, { "epoch": 0.11349942884447398, "grad_norm": 0.34765625, "grad_norm_var": 0.0012667338053385417, "learning_rate": 0.0001, "loss": 1.4033, "loss/crossentropy": 2.5307151079177856, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.16503866016864777, "step": 7601 }, { "epoch": 0.11351436101508895, "grad_norm": 0.3828125, "grad_norm_var": 0.0012080987294514974, "learning_rate": 0.0001, "loss": 1.6118, "loss/crossentropy": 2.412170886993408, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.2368343397974968, "step": 7602 }, { "epoch": 0.11352929318570394, "grad_norm": 0.3359375, "grad_norm_var": 0.0010587056477864584, "learning_rate": 0.0001, "loss": 1.3911, "loss/crossentropy": 2.603402614593506, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.17629308253526688, "step": 7603 }, { "epoch": 0.11354422535631892, "grad_norm": 0.3359375, "grad_norm_var": 0.001024611790974935, "learning_rate": 0.0001, "loss": 1.4465, "loss/crossentropy": 2.6515352725982666, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.19258858263492584, "step": 7604 }, { "epoch": 0.11355915752693391, "grad_norm": 0.34375, "grad_norm_var": 0.0010346571604410808, "learning_rate": 0.0001, "loss": 1.5294, "loss/crossentropy": 2.7835947275161743, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20903890579938889, "step": 7605 }, { "epoch": 0.11357408969754888, "grad_norm": 0.384765625, "grad_norm_var": 0.0009676456451416015, "learning_rate": 0.0001, "loss": 1.7636, "loss/crossentropy": 2.3240004777908325, "loss/fcd": 1.5078125, "loss/idx": 10.0, "loss/logits": 0.25575895607471466, "step": 7606 }, { "epoch": 0.11358902186816387, "grad_norm": 0.314453125, "grad_norm_var": 0.001002359390258789, "learning_rate": 0.0001, "loss": 1.3729, "loss/crossentropy": 2.5236655473709106, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.17370004206895828, "step": 7607 }, { "epoch": 0.11360395403877885, "grad_norm": 0.353515625, "grad_norm_var": 0.0006571451822916667, "learning_rate": 0.0001, "loss": 1.5136, "loss/crossentropy": 2.53687059879303, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.1933315098285675, "step": 7608 }, { "epoch": 0.11361888620939382, "grad_norm": 0.36328125, "grad_norm_var": 0.0006225426991780599, "learning_rate": 0.0001, "loss": 1.4025, "loss/crossentropy": 2.8077008724212646, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1720079630613327, "step": 7609 }, { "epoch": 0.11363381838000881, "grad_norm": 0.330078125, "grad_norm_var": 0.0005706151326497396, "learning_rate": 0.0001, "loss": 1.3866, "loss/crossentropy": 2.613183856010437, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.17174173891544342, "step": 7610 }, { "epoch": 0.11364875055062379, "grad_norm": 0.33203125, "grad_norm_var": 0.0005847771962483723, "learning_rate": 0.0001, "loss": 1.3082, "loss/crossentropy": 2.5787192583084106, "loss/fcd": 1.15625, "loss/idx": 10.0, "loss/logits": 0.151928149163723, "step": 7611 }, { "epoch": 0.11366368272123878, "grad_norm": 0.439453125, "grad_norm_var": 0.0010112603505452475, "learning_rate": 0.0001, "loss": 1.4851, "loss/crossentropy": 2.7264750003814697, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.18042220175266266, "step": 7612 }, { "epoch": 0.11367861489185375, "grad_norm": 0.392578125, "grad_norm_var": 0.0011281331380208334, "learning_rate": 0.0001, "loss": 1.5979, "loss/crossentropy": 2.890534996986389, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.20727279782295227, "step": 7613 }, { "epoch": 0.11369354706246873, "grad_norm": 0.34765625, "grad_norm_var": 0.001052077611287435, "learning_rate": 0.0001, "loss": 1.4843, "loss/crossentropy": 2.5693920850753784, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.20305275917053223, "step": 7614 }, { "epoch": 0.11370847923308372, "grad_norm": 0.35546875, "grad_norm_var": 0.0010066827138264973, "learning_rate": 0.0001, "loss": 1.5093, "loss/crossentropy": 2.4274264574050903, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.20070522278547287, "step": 7615 }, { "epoch": 0.1137234114036987, "grad_norm": 0.404296875, "grad_norm_var": 0.00106809933980306, "learning_rate": 0.0001, "loss": 1.5615, "loss/crossentropy": 2.553189754486084, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.20213793963193893, "step": 7616 }, { "epoch": 0.11373834357431369, "grad_norm": 0.31640625, "grad_norm_var": 0.0011815230051676433, "learning_rate": 0.0001, "loss": 1.4254, "loss/crossentropy": 2.6855313777923584, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.17931875586509705, "step": 7617 }, { "epoch": 0.11375327574492866, "grad_norm": 0.333984375, "grad_norm_var": 0.001170794169108073, "learning_rate": 0.0001, "loss": 1.5008, "loss/crossentropy": 2.575944662094116, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19614562392234802, "step": 7618 }, { "epoch": 0.11376820791554364, "grad_norm": 0.373046875, "grad_norm_var": 0.0011614322662353515, "learning_rate": 0.0001, "loss": 1.6315, "loss/crossentropy": 2.4765747785568237, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.2603796124458313, "step": 7619 }, { "epoch": 0.11378314008615863, "grad_norm": 0.345703125, "grad_norm_var": 0.0011392593383789062, "learning_rate": 0.0001, "loss": 1.5169, "loss/crossentropy": 2.564499020576477, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.20829399675130844, "step": 7620 }, { "epoch": 0.1137980722567736, "grad_norm": 0.345703125, "grad_norm_var": 0.0011357466379801433, "learning_rate": 0.0001, "loss": 1.5773, "loss/crossentropy": 2.6539132595062256, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.2218562290072441, "step": 7621 }, { "epoch": 0.11381300442738859, "grad_norm": 0.31640625, "grad_norm_var": 0.001186370849609375, "learning_rate": 0.0001, "loss": 1.3577, "loss/crossentropy": 2.6711466312408447, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.17408640682697296, "step": 7622 }, { "epoch": 0.11382793659800357, "grad_norm": 0.322265625, "grad_norm_var": 0.00114898681640625, "learning_rate": 0.0001, "loss": 1.5169, "loss/crossentropy": 2.76636803150177, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.22000323981046677, "step": 7623 }, { "epoch": 0.11384286876861854, "grad_norm": 0.3125, "grad_norm_var": 0.0012594699859619141, "learning_rate": 0.0001, "loss": 1.3522, "loss/crossentropy": 2.7163718938827515, "loss/fcd": 1.1796875, "loss/idx": 10.0, "loss/logits": 0.1724950224161148, "step": 7624 }, { "epoch": 0.11385780093923353, "grad_norm": 0.330078125, "grad_norm_var": 0.0012781143188476563, "learning_rate": 0.0001, "loss": 1.5514, "loss/crossentropy": 2.8054680824279785, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.22328966856002808, "step": 7625 }, { "epoch": 0.11387273310984851, "grad_norm": 0.3515625, "grad_norm_var": 0.001250314712524414, "learning_rate": 0.0001, "loss": 1.4199, "loss/crossentropy": 2.6704416275024414, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.17770209908485413, "step": 7626 }, { "epoch": 0.1138876652804635, "grad_norm": 0.32421875, "grad_norm_var": 0.00127409299214681, "learning_rate": 0.0001, "loss": 1.4865, "loss/crossentropy": 2.6449066400527954, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.20920579135417938, "step": 7627 }, { "epoch": 0.11390259745107847, "grad_norm": 0.36328125, "grad_norm_var": 0.0007354100545247396, "learning_rate": 0.0001, "loss": 1.5332, "loss/crossentropy": 2.510986089706421, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.20120149850845337, "step": 7628 }, { "epoch": 0.11391752962169346, "grad_norm": 0.369140625, "grad_norm_var": 0.0006240208943684896, "learning_rate": 0.0001, "loss": 1.5432, "loss/crossentropy": 2.530715584754944, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.20337103307247162, "step": 7629 }, { "epoch": 0.11393246179230844, "grad_norm": 0.34765625, "grad_norm_var": 0.0006240208943684896, "learning_rate": 0.0001, "loss": 1.6968, "loss/crossentropy": 2.178926944732666, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.2553730905056, "step": 7630 }, { "epoch": 0.11394739396292342, "grad_norm": 0.306640625, "grad_norm_var": 0.0007015069325764974, "learning_rate": 0.0001, "loss": 1.3783, "loss/crossentropy": 2.7253860235214233, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.1829543337225914, "step": 7631 }, { "epoch": 0.1139623261335384, "grad_norm": 0.306640625, "grad_norm_var": 0.00047898292541503906, "learning_rate": 0.0001, "loss": 1.3954, "loss/crossentropy": 2.541337490081787, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.18055128306150436, "step": 7632 }, { "epoch": 0.11397725830415338, "grad_norm": 0.333984375, "grad_norm_var": 0.000453948974609375, "learning_rate": 0.0001, "loss": 1.3719, "loss/crossentropy": 2.6061023473739624, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.1687927395105362, "step": 7633 }, { "epoch": 0.11399219047476837, "grad_norm": 0.328125, "grad_norm_var": 0.00045800209045410156, "learning_rate": 0.0001, "loss": 1.4399, "loss/crossentropy": 2.683804154396057, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18994466215372086, "step": 7634 }, { "epoch": 0.11400712264538335, "grad_norm": 0.3515625, "grad_norm_var": 0.00038089752197265623, "learning_rate": 0.0001, "loss": 1.4253, "loss/crossentropy": 2.429241418838501, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1791899874806404, "step": 7635 }, { "epoch": 0.11402205481599832, "grad_norm": 0.3984375, "grad_norm_var": 0.0006319522857666015, "learning_rate": 0.0001, "loss": 1.7375, "loss/crossentropy": 2.7774120569229126, "loss/fcd": 1.45703125, "loss/idx": 10.0, "loss/logits": 0.2805045545101166, "step": 7636 }, { "epoch": 0.11403698698661331, "grad_norm": 0.3671875, "grad_norm_var": 0.000682830810546875, "learning_rate": 0.0001, "loss": 1.6521, "loss/crossentropy": 2.58572518825531, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.2302740141749382, "step": 7637 }, { "epoch": 0.11405191915722829, "grad_norm": 0.330078125, "grad_norm_var": 0.0006526788075764974, "learning_rate": 0.0001, "loss": 1.3968, "loss/crossentropy": 2.634220838546753, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.17809873074293137, "step": 7638 }, { "epoch": 0.11406685132784328, "grad_norm": 0.328125, "grad_norm_var": 0.0006408055623372395, "learning_rate": 0.0001, "loss": 1.3697, "loss/crossentropy": 2.7457058429718018, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.17827415466308594, "step": 7639 }, { "epoch": 0.11408178349845825, "grad_norm": 0.35546875, "grad_norm_var": 0.0005953470865885417, "learning_rate": 0.0001, "loss": 1.4108, "loss/crossentropy": 2.597440481185913, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.1842612326145172, "step": 7640 }, { "epoch": 0.11409671566907323, "grad_norm": 0.349609375, "grad_norm_var": 0.0005848566691080729, "learning_rate": 0.0001, "loss": 1.477, "loss/crossentropy": 2.606862425804138, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.20354649424552917, "step": 7641 }, { "epoch": 0.11411164783968822, "grad_norm": 0.322265625, "grad_norm_var": 0.0006108442942301432, "learning_rate": 0.0001, "loss": 1.4474, "loss/crossentropy": 2.463210105895996, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.18956931680440903, "step": 7642 }, { "epoch": 0.1141265800103032, "grad_norm": 0.357421875, "grad_norm_var": 0.00059814453125, "learning_rate": 0.0001, "loss": 1.3711, "loss/crossentropy": 2.6108521223068237, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.1796787604689598, "step": 7643 }, { "epoch": 0.11414151218091818, "grad_norm": 0.40625, "grad_norm_var": 0.0008198420206705729, "learning_rate": 0.0001, "loss": 1.7476, "loss/crossentropy": 2.7022918462753296, "loss/fcd": 1.48828125, "loss/idx": 10.0, "loss/logits": 0.2593133673071861, "step": 7644 }, { "epoch": 0.11415644435153316, "grad_norm": 0.3515625, "grad_norm_var": 0.0007882277170817057, "learning_rate": 0.0001, "loss": 1.5602, "loss/crossentropy": 2.8521987199783325, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.27505212277173996, "step": 7645 }, { "epoch": 0.11417137652214814, "grad_norm": 0.3046875, "grad_norm_var": 0.0008959293365478516, "learning_rate": 0.0001, "loss": 1.3963, "loss/crossentropy": 2.5430914163589478, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.18536732345819473, "step": 7646 }, { "epoch": 0.11418630869276312, "grad_norm": 0.314453125, "grad_norm_var": 0.000861215591430664, "learning_rate": 0.0001, "loss": 1.4388, "loss/crossentropy": 2.855678081512451, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.19664417952299118, "step": 7647 }, { "epoch": 0.1142012408633781, "grad_norm": 0.431640625, "grad_norm_var": 0.0012131849924723306, "learning_rate": 0.0001, "loss": 1.444, "loss/crossentropy": 2.551337242126465, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.19395722448825836, "step": 7648 }, { "epoch": 0.11421617303399309, "grad_norm": 0.330078125, "grad_norm_var": 0.0012234846750895183, "learning_rate": 0.0001, "loss": 1.2952, "loss/crossentropy": 2.587975263595581, "loss/fcd": 1.140625, "loss/idx": 10.0, "loss/logits": 0.1546219065785408, "step": 7649 }, { "epoch": 0.11423110520460807, "grad_norm": 0.421875, "grad_norm_var": 0.0014783064524332681, "learning_rate": 0.0001, "loss": 1.736, "loss/crossentropy": 2.6216869354248047, "loss/fcd": 1.49609375, "loss/idx": 10.0, "loss/logits": 0.23986878246068954, "step": 7650 }, { "epoch": 0.11424603737522306, "grad_norm": 0.41796875, "grad_norm_var": 0.0017009576161702473, "learning_rate": 0.0001, "loss": 1.6334, "loss/crossentropy": 2.765339493751526, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.2388574555516243, "step": 7651 }, { "epoch": 0.11426096954583803, "grad_norm": 0.453125, "grad_norm_var": 0.0021557966868082684, "learning_rate": 0.0001, "loss": 1.7888, "loss/crossentropy": 2.380789041519165, "loss/fcd": 1.546875, "loss/idx": 10.0, "loss/logits": 0.24191300570964813, "step": 7652 }, { "epoch": 0.11427590171645301, "grad_norm": 0.361328125, "grad_norm_var": 0.0021563212076822916, "learning_rate": 0.0001, "loss": 1.4735, "loss/crossentropy": 2.5463061332702637, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.18837622553110123, "step": 7653 }, { "epoch": 0.114290833887068, "grad_norm": 0.314453125, "grad_norm_var": 0.002243804931640625, "learning_rate": 0.0001, "loss": 1.4641, "loss/crossentropy": 2.537954807281494, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.18675274401903152, "step": 7654 }, { "epoch": 0.11430576605768297, "grad_norm": 0.326171875, "grad_norm_var": 0.0022533257802327474, "learning_rate": 0.0001, "loss": 1.4447, "loss/crossentropy": 2.6740520000457764, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.19079481810331345, "step": 7655 }, { "epoch": 0.11432069822829796, "grad_norm": 0.373046875, "grad_norm_var": 0.0022534688313802084, "learning_rate": 0.0001, "loss": 1.4641, "loss/crossentropy": 2.725187301635742, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19064431637525558, "step": 7656 }, { "epoch": 0.11433563039891294, "grad_norm": 0.3203125, "grad_norm_var": 0.0023662408192952475, "learning_rate": 0.0001, "loss": 1.4606, "loss/crossentropy": 2.7049354314804077, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19493040442466736, "step": 7657 }, { "epoch": 0.11435056256952791, "grad_norm": 0.326171875, "grad_norm_var": 0.0023460229237874347, "learning_rate": 0.0001, "loss": 1.4225, "loss/crossentropy": 2.5547120571136475, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1802694872021675, "step": 7658 }, { "epoch": 0.1143654947401429, "grad_norm": 0.36328125, "grad_norm_var": 0.0023436864217122396, "learning_rate": 0.0001, "loss": 1.56, "loss/crossentropy": 2.6695884466171265, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.22407027333974838, "step": 7659 }, { "epoch": 0.11438042691075788, "grad_norm": 0.38671875, "grad_norm_var": 0.0022562662760416668, "learning_rate": 0.0001, "loss": 1.5554, "loss/crossentropy": 2.3968054056167603, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.21169056743383408, "step": 7660 }, { "epoch": 0.11439535908137287, "grad_norm": 0.341796875, "grad_norm_var": 0.0022762139638264974, "learning_rate": 0.0001, "loss": 1.3869, "loss/crossentropy": 2.7718487977981567, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17600619792938232, "step": 7661 }, { "epoch": 0.11441029125198784, "grad_norm": 0.32421875, "grad_norm_var": 0.002151600519816081, "learning_rate": 0.0001, "loss": 1.362, "loss/crossentropy": 2.713175415992737, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.17061498016119003, "step": 7662 }, { "epoch": 0.11442522342260282, "grad_norm": 0.337890625, "grad_norm_var": 0.002034489313761393, "learning_rate": 0.0001, "loss": 1.4361, "loss/crossentropy": 2.835126519203186, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.17442604154348373, "step": 7663 }, { "epoch": 0.11444015559321781, "grad_norm": 0.33984375, "grad_norm_var": 0.0017379124959309895, "learning_rate": 0.0001, "loss": 1.4838, "loss/crossentropy": 2.492437243461609, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.19084959477186203, "step": 7664 }, { "epoch": 0.11445508776383279, "grad_norm": 0.384765625, "grad_norm_var": 0.0017165501912434896, "learning_rate": 0.0001, "loss": 1.6471, "loss/crossentropy": 2.3365681171417236, "loss/fcd": 1.4296875, "loss/idx": 10.0, "loss/logits": 0.21736811101436615, "step": 7665 }, { "epoch": 0.11447001993444778, "grad_norm": 0.43359375, "grad_norm_var": 0.0018185933430989584, "learning_rate": 0.0001, "loss": 1.7263, "loss/crossentropy": 2.381702184677124, "loss/fcd": 1.51171875, "loss/idx": 10.0, "loss/logits": 0.21454493701457977, "step": 7666 }, { "epoch": 0.11448495210506275, "grad_norm": 0.345703125, "grad_norm_var": 0.0016133467356363933, "learning_rate": 0.0001, "loss": 1.556, "loss/crossentropy": 2.7123149633407593, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.21611309051513672, "step": 7667 }, { "epoch": 0.11449988427567774, "grad_norm": 0.33984375, "grad_norm_var": 0.0009827772776285807, "learning_rate": 0.0001, "loss": 1.5206, "loss/crossentropy": 2.6967110633850098, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.21586640179157257, "step": 7668 }, { "epoch": 0.11451481644629272, "grad_norm": 0.326171875, "grad_norm_var": 0.0010125319163004558, "learning_rate": 0.0001, "loss": 1.4125, "loss/crossentropy": 2.7218239307403564, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.18590611219406128, "step": 7669 }, { "epoch": 0.11452974861690769, "grad_norm": 0.32421875, "grad_norm_var": 0.0009735107421875, "learning_rate": 0.0001, "loss": 1.4545, "loss/crossentropy": 2.626486659049988, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.19281607866287231, "step": 7670 }, { "epoch": 0.11454468078752268, "grad_norm": 0.35546875, "grad_norm_var": 0.0009356021881103515, "learning_rate": 0.0001, "loss": 1.4081, "loss/crossentropy": 2.83123779296875, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.1737566441297531, "step": 7671 }, { "epoch": 0.11455961295813766, "grad_norm": 0.337890625, "grad_norm_var": 0.0009115695953369141, "learning_rate": 0.0001, "loss": 1.5412, "loss/crossentropy": 2.6111382246017456, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.2286744862794876, "step": 7672 }, { "epoch": 0.11457454512875265, "grad_norm": 0.376953125, "grad_norm_var": 0.0008935928344726562, "learning_rate": 0.0001, "loss": 1.3334, "loss/crossentropy": 2.749969005584717, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.14985291659832, "step": 7673 }, { "epoch": 0.11458947729936762, "grad_norm": 0.353515625, "grad_norm_var": 0.0008433024088541667, "learning_rate": 0.0001, "loss": 1.5843, "loss/crossentropy": 2.66794490814209, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.213243767619133, "step": 7674 }, { "epoch": 0.1146044094699826, "grad_norm": 0.51953125, "grad_norm_var": 0.002552286783854167, "learning_rate": 0.0001, "loss": 1.5755, "loss/crossentropy": 2.5508344173431396, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.22785235941410065, "step": 7675 }, { "epoch": 0.11461934164059759, "grad_norm": 0.31640625, "grad_norm_var": 0.0026507059733072917, "learning_rate": 0.0001, "loss": 1.4032, "loss/crossentropy": 2.4364633560180664, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.16881967335939407, "step": 7676 }, { "epoch": 0.11463427381121256, "grad_norm": 0.337890625, "grad_norm_var": 0.0026610692342122397, "learning_rate": 0.0001, "loss": 1.5389, "loss/crossentropy": 2.3357324600219727, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.20293943583965302, "step": 7677 }, { "epoch": 0.11464920598182755, "grad_norm": 0.390625, "grad_norm_var": 0.0026232401529947915, "learning_rate": 0.0001, "loss": 1.6272, "loss/crossentropy": 2.3262957334518433, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.23263265192508698, "step": 7678 }, { "epoch": 0.11466413815244253, "grad_norm": 0.341796875, "grad_norm_var": 0.0026107152303059896, "learning_rate": 0.0001, "loss": 1.6079, "loss/crossentropy": 2.6131008863449097, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.25238627195358276, "step": 7679 }, { "epoch": 0.1146790703230575, "grad_norm": 0.345703125, "grad_norm_var": 0.0025939782460530597, "learning_rate": 0.0001, "loss": 1.568, "loss/crossentropy": 2.5357168912887573, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.21643568575382233, "step": 7680 }, { "epoch": 0.1146940024936725, "grad_norm": 0.37109375, "grad_norm_var": 0.002568499247233073, "learning_rate": 0.0001, "loss": 1.6097, "loss/crossentropy": 2.898231267929077, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.21902897208929062, "step": 7681 }, { "epoch": 0.11470893466428747, "grad_norm": 0.326171875, "grad_norm_var": 0.0022861321767171225, "learning_rate": 0.0001, "loss": 1.401, "loss/crossentropy": 2.6848158836364746, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.18611286580562592, "step": 7682 }, { "epoch": 0.11472386683490246, "grad_norm": 0.32421875, "grad_norm_var": 0.0023468017578125, "learning_rate": 0.0001, "loss": 1.4309, "loss/crossentropy": 2.7196210622787476, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1848321557044983, "step": 7683 }, { "epoch": 0.11473879900551744, "grad_norm": 0.341796875, "grad_norm_var": 0.002342971165974935, "learning_rate": 0.0001, "loss": 1.5451, "loss/crossentropy": 2.6033250093460083, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.21310831606388092, "step": 7684 }, { "epoch": 0.11475373117613241, "grad_norm": 0.3515625, "grad_norm_var": 0.002283668518066406, "learning_rate": 0.0001, "loss": 1.6858, "loss/crossentropy": 2.462214946746826, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.2522353082895279, "step": 7685 }, { "epoch": 0.1147686633467474, "grad_norm": 0.330078125, "grad_norm_var": 0.0022600650787353515, "learning_rate": 0.0001, "loss": 1.5308, "loss/crossentropy": 2.596477746963501, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.2339383289217949, "step": 7686 }, { "epoch": 0.11478359551736238, "grad_norm": 0.333984375, "grad_norm_var": 0.0022948582967122397, "learning_rate": 0.0001, "loss": 1.3823, "loss/crossentropy": 2.736386775970459, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.17916233092546463, "step": 7687 }, { "epoch": 0.11479852768797737, "grad_norm": 0.494140625, "grad_norm_var": 0.0034392674763997397, "learning_rate": 0.0001, "loss": 1.7167, "loss/crossentropy": 2.783638119697571, "loss/fcd": 1.46875, "loss/idx": 10.0, "loss/logits": 0.2479494959115982, "step": 7688 }, { "epoch": 0.11481345985859234, "grad_norm": 0.337890625, "grad_norm_var": 0.0034774144490559897, "learning_rate": 0.0001, "loss": 1.5427, "loss/crossentropy": 2.5621421337127686, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.1950080692768097, "step": 7689 }, { "epoch": 0.11482839202920733, "grad_norm": 0.34765625, "grad_norm_var": 0.003487380345662435, "learning_rate": 0.0001, "loss": 1.428, "loss/crossentropy": 2.571321725845337, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18970520049333572, "step": 7690 }, { "epoch": 0.11484332419982231, "grad_norm": 0.3515625, "grad_norm_var": 0.0017486413319905598, "learning_rate": 0.0001, "loss": 1.4889, "loss/crossentropy": 2.5891835689544678, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.19983303546905518, "step": 7691 }, { "epoch": 0.11485825637043728, "grad_norm": 0.30859375, "grad_norm_var": 0.0017902215321858725, "learning_rate": 0.0001, "loss": 1.4167, "loss/crossentropy": 2.5983372926712036, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.17837008833885193, "step": 7692 }, { "epoch": 0.11487318854105227, "grad_norm": 0.37890625, "grad_norm_var": 0.0018172581990559896, "learning_rate": 0.0001, "loss": 1.6054, "loss/crossentropy": 2.5235577821731567, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.2264648675918579, "step": 7693 }, { "epoch": 0.11488812071166725, "grad_norm": 0.400390625, "grad_norm_var": 0.0018699487050374349, "learning_rate": 0.0001, "loss": 1.5592, "loss/crossentropy": 2.8010886907577515, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.21540245413780212, "step": 7694 }, { "epoch": 0.11490305288228224, "grad_norm": 0.365234375, "grad_norm_var": 0.0018619378407796225, "learning_rate": 0.0001, "loss": 1.522, "loss/crossentropy": 2.631449341773987, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20164059102535248, "step": 7695 }, { "epoch": 0.11491798505289721, "grad_norm": 0.427734375, "grad_norm_var": 0.002161010106404622, "learning_rate": 0.0001, "loss": 1.6299, "loss/crossentropy": 2.5852681398391724, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.219709575176239, "step": 7696 }, { "epoch": 0.11493291722351219, "grad_norm": 0.283203125, "grad_norm_var": 0.002536519368489583, "learning_rate": 0.0001, "loss": 1.3157, "loss/crossentropy": 2.55092716217041, "loss/fcd": 1.15234375, "loss/idx": 10.0, "loss/logits": 0.16331718862056732, "step": 7697 }, { "epoch": 0.11494784939412718, "grad_norm": 0.341796875, "grad_norm_var": 0.00248870849609375, "learning_rate": 0.0001, "loss": 1.4738, "loss/crossentropy": 2.6379916667938232, "loss/fcd": 1.28515625, "loss/idx": 10.0, "loss/logits": 0.18862104415893555, "step": 7698 }, { "epoch": 0.11496278156474216, "grad_norm": 0.3203125, "grad_norm_var": 0.0025069554646809894, "learning_rate": 0.0001, "loss": 1.5047, "loss/crossentropy": 2.5167839527130127, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.2078711986541748, "step": 7699 }, { "epoch": 0.11497771373535715, "grad_norm": 0.306640625, "grad_norm_var": 0.0026563008626302085, "learning_rate": 0.0001, "loss": 1.4883, "loss/crossentropy": 2.773556709289551, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.20700568705797195, "step": 7700 }, { "epoch": 0.11499264590597212, "grad_norm": 0.376953125, "grad_norm_var": 0.0026850223541259764, "learning_rate": 0.0001, "loss": 1.6009, "loss/crossentropy": 2.510080575942993, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.225904680788517, "step": 7701 }, { "epoch": 0.1150075780765871, "grad_norm": 0.423828125, "grad_norm_var": 0.0029032230377197266, "learning_rate": 0.0001, "loss": 1.4862, "loss/crossentropy": 2.6565375328063965, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.20493543148040771, "step": 7702 }, { "epoch": 0.11502251024720209, "grad_norm": 0.306640625, "grad_norm_var": 0.003053649266560872, "learning_rate": 0.0001, "loss": 1.4725, "loss/crossentropy": 2.675613045692444, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.20295748114585876, "step": 7703 }, { "epoch": 0.11503744241781706, "grad_norm": 0.3203125, "grad_norm_var": 0.001849810282389323, "learning_rate": 0.0001, "loss": 1.4002, "loss/crossentropy": 2.8055825233459473, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17754733562469482, "step": 7704 }, { "epoch": 0.11505237458843205, "grad_norm": 0.330078125, "grad_norm_var": 0.0018660863240559896, "learning_rate": 0.0001, "loss": 1.5, "loss/crossentropy": 2.5722914934158325, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.20699862390756607, "step": 7705 }, { "epoch": 0.11506730675904703, "grad_norm": 0.353515625, "grad_norm_var": 0.0018668969472249348, "learning_rate": 0.0001, "loss": 1.3822, "loss/crossentropy": 2.551601529121399, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.17904410511255264, "step": 7706 }, { "epoch": 0.115082238929662, "grad_norm": 0.32421875, "grad_norm_var": 0.0019069512685139973, "learning_rate": 0.0001, "loss": 1.5287, "loss/crossentropy": 2.4558480978012085, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20842983573675156, "step": 7707 }, { "epoch": 0.115097171100277, "grad_norm": 0.3359375, "grad_norm_var": 0.0018099308013916015, "learning_rate": 0.0001, "loss": 1.4655, "loss/crossentropy": 2.6562154293060303, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.20377454906702042, "step": 7708 }, { "epoch": 0.11511210327089197, "grad_norm": 0.3671875, "grad_norm_var": 0.001772928237915039, "learning_rate": 0.0001, "loss": 1.4125, "loss/crossentropy": 2.6276724338531494, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.17423655092716217, "step": 7709 }, { "epoch": 0.11512703544150696, "grad_norm": 0.44140625, "grad_norm_var": 0.00215911865234375, "learning_rate": 0.0001, "loss": 1.8153, "loss/crossentropy": 2.393542766571045, "loss/fcd": 1.4921875, "loss/idx": 10.0, "loss/logits": 0.3230624943971634, "step": 7710 }, { "epoch": 0.11514196761212193, "grad_norm": 0.427734375, "grad_norm_var": 0.0025171915690104167, "learning_rate": 0.0001, "loss": 1.759, "loss/crossentropy": 2.5153708457946777, "loss/fcd": 1.5078125, "loss/idx": 10.0, "loss/logits": 0.251209557056427, "step": 7711 }, { "epoch": 0.11515689978273692, "grad_norm": 0.345703125, "grad_norm_var": 0.0021473566691080728, "learning_rate": 0.0001, "loss": 1.5735, "loss/crossentropy": 2.5969409942626953, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.23755469173192978, "step": 7712 }, { "epoch": 0.1151718319533519, "grad_norm": 0.353515625, "grad_norm_var": 0.001826922098795573, "learning_rate": 0.0001, "loss": 1.4208, "loss/crossentropy": 2.6199915409088135, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18639564514160156, "step": 7713 }, { "epoch": 0.11518676412396688, "grad_norm": 0.326171875, "grad_norm_var": 0.0018691380818684896, "learning_rate": 0.0001, "loss": 1.4444, "loss/crossentropy": 2.5415087938308716, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.19443099945783615, "step": 7714 }, { "epoch": 0.11520169629458187, "grad_norm": 0.359375, "grad_norm_var": 0.001790301005045573, "learning_rate": 0.0001, "loss": 1.4891, "loss/crossentropy": 2.4884963035583496, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.21176989376544952, "step": 7715 }, { "epoch": 0.11521662846519684, "grad_norm": 0.337890625, "grad_norm_var": 0.0016448338826497396, "learning_rate": 0.0001, "loss": 1.537, "loss/crossentropy": 2.658874988555908, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.21281729638576508, "step": 7716 }, { "epoch": 0.11523156063581183, "grad_norm": 0.357421875, "grad_norm_var": 0.001619720458984375, "learning_rate": 0.0001, "loss": 1.4986, "loss/crossentropy": 2.8077584505081177, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.19784045219421387, "step": 7717 }, { "epoch": 0.1152464928064268, "grad_norm": 0.36328125, "grad_norm_var": 0.0013088067372639975, "learning_rate": 0.0001, "loss": 1.4344, "loss/crossentropy": 2.6025590896606445, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.1726655289530754, "step": 7718 }, { "epoch": 0.11526142497704178, "grad_norm": 0.404296875, "grad_norm_var": 0.001299269994099935, "learning_rate": 0.0001, "loss": 1.7384, "loss/crossentropy": 2.3662655353546143, "loss/fcd": 1.4921875, "loss/idx": 10.0, "loss/logits": 0.24618270993232727, "step": 7719 }, { "epoch": 0.11527635714765677, "grad_norm": 0.341796875, "grad_norm_var": 0.001216570536295573, "learning_rate": 0.0001, "loss": 1.4751, "loss/crossentropy": 2.7876858711242676, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.20162209123373032, "step": 7720 }, { "epoch": 0.11529128931827175, "grad_norm": 0.353515625, "grad_norm_var": 0.001155535380045573, "learning_rate": 0.0001, "loss": 1.5008, "loss/crossentropy": 2.593433380126953, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.20390678942203522, "step": 7721 }, { "epoch": 0.11530622148888674, "grad_norm": 0.3125, "grad_norm_var": 0.0013074080149332683, "learning_rate": 0.0001, "loss": 1.4331, "loss/crossentropy": 2.5225906372070312, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.1830604448914528, "step": 7722 }, { "epoch": 0.11532115365950171, "grad_norm": 0.345703125, "grad_norm_var": 0.001235198974609375, "learning_rate": 0.0001, "loss": 1.4076, "loss/crossentropy": 2.658289074897766, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.18489614129066467, "step": 7723 }, { "epoch": 0.11533608583011669, "grad_norm": 0.427734375, "grad_norm_var": 0.001457071304321289, "learning_rate": 0.0001, "loss": 1.5015, "loss/crossentropy": 2.5758947134017944, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.20076710730791092, "step": 7724 }, { "epoch": 0.11535101800073168, "grad_norm": 0.51171875, "grad_norm_var": 0.0027744134267171225, "learning_rate": 0.0001, "loss": 1.5893, "loss/crossentropy": 2.5563353300094604, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.23384343832731247, "step": 7725 }, { "epoch": 0.11536595017134665, "grad_norm": 0.337890625, "grad_norm_var": 0.0025360107421875, "learning_rate": 0.0001, "loss": 1.4258, "loss/crossentropy": 2.6571189165115356, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.17967627942562103, "step": 7726 }, { "epoch": 0.11538088234196164, "grad_norm": 0.34765625, "grad_norm_var": 0.0023111820220947264, "learning_rate": 0.0001, "loss": 1.5622, "loss/crossentropy": 2.6022270917892456, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.2184845507144928, "step": 7727 }, { "epoch": 0.11539581451257662, "grad_norm": 0.341796875, "grad_norm_var": 0.002321736017862956, "learning_rate": 0.0001, "loss": 1.4095, "loss/crossentropy": 2.4706766605377197, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.1829346939921379, "step": 7728 }, { "epoch": 0.11541074668319161, "grad_norm": 0.376953125, "grad_norm_var": 0.0023236433664957684, "learning_rate": 0.0001, "loss": 1.49, "loss/crossentropy": 2.6260135173797607, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.20094988495111465, "step": 7729 }, { "epoch": 0.11542567885380658, "grad_norm": 0.3828125, "grad_norm_var": 0.002228228251139323, "learning_rate": 0.0001, "loss": 1.5317, "loss/crossentropy": 2.5838420391082764, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.21529077738523483, "step": 7730 }, { "epoch": 0.11544061102442156, "grad_norm": 0.3203125, "grad_norm_var": 0.002373186747233073, "learning_rate": 0.0001, "loss": 1.4358, "loss/crossentropy": 2.6770251989364624, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.19364836066961288, "step": 7731 }, { "epoch": 0.11545554319503655, "grad_norm": 0.36328125, "grad_norm_var": 0.002316776911417643, "learning_rate": 0.0001, "loss": 1.5383, "loss/crossentropy": 2.4293025732040405, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.21794313192367554, "step": 7732 }, { "epoch": 0.11547047536565153, "grad_norm": 0.390625, "grad_norm_var": 0.002338663736979167, "learning_rate": 0.0001, "loss": 1.6883, "loss/crossentropy": 2.599344491958618, "loss/fcd": 1.43359375, "loss/idx": 10.0, "loss/logits": 0.25473253428936005, "step": 7733 }, { "epoch": 0.11548540753626652, "grad_norm": 0.37109375, "grad_norm_var": 0.002335357666015625, "learning_rate": 0.0001, "loss": 1.5494, "loss/crossentropy": 2.613500714302063, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.21341344714164734, "step": 7734 }, { "epoch": 0.11550033970688149, "grad_norm": 0.37109375, "grad_norm_var": 0.002255105972290039, "learning_rate": 0.0001, "loss": 1.5185, "loss/crossentropy": 2.805259108543396, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.21382057666778564, "step": 7735 }, { "epoch": 0.11551527187749647, "grad_norm": 0.34375, "grad_norm_var": 0.002248382568359375, "learning_rate": 0.0001, "loss": 1.4306, "loss/crossentropy": 2.7234771251678467, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1884269267320633, "step": 7736 }, { "epoch": 0.11553020404811146, "grad_norm": 0.42578125, "grad_norm_var": 0.002428929011027018, "learning_rate": 0.0001, "loss": 1.52, "loss/crossentropy": 2.3536499738693237, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.19971713423728943, "step": 7737 }, { "epoch": 0.11554513621872643, "grad_norm": 0.380859375, "grad_norm_var": 0.0021680196126302085, "learning_rate": 0.0001, "loss": 1.5485, "loss/crossentropy": 2.6757086515426636, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.2164231389760971, "step": 7738 }, { "epoch": 0.11556006838934142, "grad_norm": 0.328125, "grad_norm_var": 0.0022617181142171225, "learning_rate": 0.0001, "loss": 1.548, "loss/crossentropy": 2.514896869659424, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.20424316823482513, "step": 7739 }, { "epoch": 0.1155750005599564, "grad_norm": 0.380859375, "grad_norm_var": 0.0020778497060139974, "learning_rate": 0.0001, "loss": 1.4876, "loss/crossentropy": 2.471389412879944, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19068226218223572, "step": 7740 }, { "epoch": 0.11558993273057137, "grad_norm": 0.33984375, "grad_norm_var": 0.0007546583811442058, "learning_rate": 0.0001, "loss": 1.5829, "loss/crossentropy": 2.5950037240982056, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.22741705179214478, "step": 7741 }, { "epoch": 0.11560486490118636, "grad_norm": 0.373046875, "grad_norm_var": 0.0007157484690348307, "learning_rate": 0.0001, "loss": 1.4742, "loss/crossentropy": 2.5612025260925293, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.2007574662566185, "step": 7742 }, { "epoch": 0.11561979707180134, "grad_norm": 0.388671875, "grad_norm_var": 0.000726763407389323, "learning_rate": 0.0001, "loss": 1.6493, "loss/crossentropy": 2.811813473701477, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.2430395781993866, "step": 7743 }, { "epoch": 0.11563472924241633, "grad_norm": 0.3515625, "grad_norm_var": 0.0006993452707926433, "learning_rate": 0.0001, "loss": 1.4438, "loss/crossentropy": 2.634255051612854, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1860281154513359, "step": 7744 }, { "epoch": 0.1156496614130313, "grad_norm": 0.349609375, "grad_norm_var": 0.0007135868072509766, "learning_rate": 0.0001, "loss": 1.4794, "loss/crossentropy": 2.626253604888916, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.19815891236066818, "step": 7745 }, { "epoch": 0.11566459358364628, "grad_norm": 0.3359375, "grad_norm_var": 0.0007479190826416016, "learning_rate": 0.0001, "loss": 1.4538, "loss/crossentropy": 2.5313998460769653, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.19991926103830338, "step": 7746 }, { "epoch": 0.11567952575426127, "grad_norm": 0.33984375, "grad_norm_var": 0.0006595452626546224, "learning_rate": 0.0001, "loss": 1.47, "loss/crossentropy": 2.5468595027923584, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1887022778391838, "step": 7747 }, { "epoch": 0.11569445792487625, "grad_norm": 0.291015625, "grad_norm_var": 0.0009988784790039063, "learning_rate": 0.0001, "loss": 1.3619, "loss/crossentropy": 2.57601797580719, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.17444271594285965, "step": 7748 }, { "epoch": 0.11570939009549124, "grad_norm": 0.328125, "grad_norm_var": 0.0009887059529622396, "learning_rate": 0.0001, "loss": 1.5969, "loss/crossentropy": 2.3641257286071777, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.2414417266845703, "step": 7749 }, { "epoch": 0.11572432226610621, "grad_norm": 0.357421875, "grad_norm_var": 0.0009732405344645182, "learning_rate": 0.0001, "loss": 1.6891, "loss/crossentropy": 2.309657335281372, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.24765976518392563, "step": 7750 }, { "epoch": 0.1157392544367212, "grad_norm": 0.443359375, "grad_norm_var": 0.0014513651529947916, "learning_rate": 0.0001, "loss": 1.693, "loss/crossentropy": 2.402813196182251, "loss/fcd": 1.4609375, "loss/idx": 10.0, "loss/logits": 0.23205309361219406, "step": 7751 }, { "epoch": 0.11575418660733618, "grad_norm": 0.33203125, "grad_norm_var": 0.001485125223795573, "learning_rate": 0.0001, "loss": 1.3715, "loss/crossentropy": 2.8358232975006104, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.17620746046304703, "step": 7752 }, { "epoch": 0.11576911877795115, "grad_norm": 0.36328125, "grad_norm_var": 0.0011738459269205728, "learning_rate": 0.0001, "loss": 1.4345, "loss/crossentropy": 2.483711004257202, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18452757596969604, "step": 7753 }, { "epoch": 0.11578405094856614, "grad_norm": 0.341796875, "grad_norm_var": 0.0011356989542643228, "learning_rate": 0.0001, "loss": 1.4465, "loss/crossentropy": 2.6167296171188354, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.18866796046495438, "step": 7754 }, { "epoch": 0.11579898311918112, "grad_norm": 0.359375, "grad_norm_var": 0.0010939915974934897, "learning_rate": 0.0001, "loss": 1.3842, "loss/crossentropy": 2.6967437267303467, "loss/fcd": 1.2109375, "loss/idx": 10.0, "loss/logits": 0.17325028777122498, "step": 7755 }, { "epoch": 0.11581391528979611, "grad_norm": 0.361328125, "grad_norm_var": 0.0010498046875, "learning_rate": 0.0001, "loss": 1.4647, "loss/crossentropy": 3.009069561958313, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19906962662935257, "step": 7756 }, { "epoch": 0.11582884746041108, "grad_norm": 0.349609375, "grad_norm_var": 0.0010379632314046225, "learning_rate": 0.0001, "loss": 1.4868, "loss/crossentropy": 2.4460922479629517, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.18997105211019516, "step": 7757 }, { "epoch": 0.11584377963102606, "grad_norm": 0.40234375, "grad_norm_var": 0.0011655171712239584, "learning_rate": 0.0001, "loss": 1.5738, "loss/crossentropy": 2.6610080003738403, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.19491208344697952, "step": 7758 }, { "epoch": 0.11585871180164105, "grad_norm": 0.361328125, "grad_norm_var": 0.0010929743448893229, "learning_rate": 0.0001, "loss": 1.5643, "loss/crossentropy": 2.5532522201538086, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.23230475187301636, "step": 7759 }, { "epoch": 0.11587364397225602, "grad_norm": 0.359375, "grad_norm_var": 0.0010939915974934897, "learning_rate": 0.0001, "loss": 1.5034, "loss/crossentropy": 2.6058353185653687, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.21046286821365356, "step": 7760 }, { "epoch": 0.11588857614287101, "grad_norm": 0.380859375, "grad_norm_var": 0.0011336644490559896, "learning_rate": 0.0001, "loss": 1.5365, "loss/crossentropy": 2.477532148361206, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.21230067312717438, "step": 7761 }, { "epoch": 0.11590350831348599, "grad_norm": 0.34375, "grad_norm_var": 0.0011158625284830729, "learning_rate": 0.0001, "loss": 1.5324, "loss/crossentropy": 2.569028854370117, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.2120775282382965, "step": 7762 }, { "epoch": 0.11591844048410097, "grad_norm": 0.3046875, "grad_norm_var": 0.0012743631998697917, "learning_rate": 0.0001, "loss": 1.3976, "loss/crossentropy": 2.4713571071624756, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17494460940361023, "step": 7763 }, { "epoch": 0.11593337265471595, "grad_norm": 0.31640625, "grad_norm_var": 0.0010981082916259766, "learning_rate": 0.0001, "loss": 1.3606, "loss/crossentropy": 2.709122896194458, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.17701813578605652, "step": 7764 }, { "epoch": 0.11594830482533093, "grad_norm": 0.359375, "grad_norm_var": 0.00104063351949056, "learning_rate": 0.0001, "loss": 1.5291, "loss/crossentropy": 2.8300766944885254, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.21664541959762573, "step": 7765 }, { "epoch": 0.11596323699594592, "grad_norm": 0.41796875, "grad_norm_var": 0.0012608846028645834, "learning_rate": 0.0001, "loss": 1.7424, "loss/crossentropy": 2.5015822649002075, "loss/fcd": 1.48046875, "loss/idx": 10.0, "loss/logits": 0.2619401663541794, "step": 7766 }, { "epoch": 0.1159781691665609, "grad_norm": 0.36328125, "grad_norm_var": 0.0007962385813395182, "learning_rate": 0.0001, "loss": 1.3771, "loss/crossentropy": 2.71457576751709, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.17008076608181, "step": 7767 }, { "epoch": 0.11599310133717587, "grad_norm": 0.3125, "grad_norm_var": 0.0008858839670817057, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.6803970336914062, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.21011513471603394, "step": 7768 }, { "epoch": 0.11600803350779086, "grad_norm": 0.396484375, "grad_norm_var": 0.0009866714477539062, "learning_rate": 0.0001, "loss": 1.4933, "loss/crossentropy": 2.6979212760925293, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19639449566602707, "step": 7769 }, { "epoch": 0.11602296567840584, "grad_norm": 0.421875, "grad_norm_var": 0.0012128035227457683, "learning_rate": 0.0001, "loss": 1.5252, "loss/crossentropy": 2.5702801942825317, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.212710402905941, "step": 7770 }, { "epoch": 0.11603789784902083, "grad_norm": 0.333984375, "grad_norm_var": 0.0012659072875976563, "learning_rate": 0.0001, "loss": 1.4482, "loss/crossentropy": 2.5477352142333984, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.19038254022598267, "step": 7771 }, { "epoch": 0.1160528300196358, "grad_norm": 0.37890625, "grad_norm_var": 0.001284646987915039, "learning_rate": 0.0001, "loss": 1.4405, "loss/crossentropy": 2.489807367324829, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.17882335931062698, "step": 7772 }, { "epoch": 0.11606776219025079, "grad_norm": 0.35546875, "grad_norm_var": 0.0012765884399414062, "learning_rate": 0.0001, "loss": 1.5195, "loss/crossentropy": 2.7082818746566772, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.21093281358480453, "step": 7773 }, { "epoch": 0.11608269436086577, "grad_norm": 0.322265625, "grad_norm_var": 0.0012576897939046224, "learning_rate": 0.0001, "loss": 1.5166, "loss/crossentropy": 2.6023809909820557, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.20796114206314087, "step": 7774 }, { "epoch": 0.11609762653148074, "grad_norm": 0.3671875, "grad_norm_var": 0.0012624104817708333, "learning_rate": 0.0001, "loss": 1.456, "loss/crossentropy": 2.713046073913574, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.19430501759052277, "step": 7775 }, { "epoch": 0.11611255870209573, "grad_norm": 0.3125, "grad_norm_var": 0.0013936360677083333, "learning_rate": 0.0001, "loss": 1.4094, "loss/crossentropy": 2.619749426841736, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.19069599360227585, "step": 7776 }, { "epoch": 0.11612749087271071, "grad_norm": 0.333984375, "grad_norm_var": 0.0013722737630208334, "learning_rate": 0.0001, "loss": 1.5955, "loss/crossentropy": 2.576670289039612, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.2204936370253563, "step": 7777 }, { "epoch": 0.1161424230433257, "grad_norm": 0.33984375, "grad_norm_var": 0.0013778050740559896, "learning_rate": 0.0001, "loss": 1.6096, "loss/crossentropy": 2.6040427684783936, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.23851389437913895, "step": 7778 }, { "epoch": 0.11615735521394067, "grad_norm": 0.38671875, "grad_norm_var": 0.0012776692708333333, "learning_rate": 0.0001, "loss": 1.4756, "loss/crossentropy": 2.501134991645813, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.1865081936120987, "step": 7779 }, { "epoch": 0.11617228738455565, "grad_norm": 0.326171875, "grad_norm_var": 0.0012302239735921224, "learning_rate": 0.0001, "loss": 1.4317, "loss/crossentropy": 2.8038222789764404, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.19342872500419617, "step": 7780 }, { "epoch": 0.11618721955517064, "grad_norm": 0.4140625, "grad_norm_var": 0.0014269351959228516, "learning_rate": 0.0001, "loss": 1.4966, "loss/crossentropy": 2.5407122373580933, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.19194739311933517, "step": 7781 }, { "epoch": 0.11620215172578562, "grad_norm": 0.345703125, "grad_norm_var": 0.0012087504069010417, "learning_rate": 0.0001, "loss": 1.5259, "loss/crossentropy": 2.6103967428207397, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.21337047964334488, "step": 7782 }, { "epoch": 0.1162170838964006, "grad_norm": 0.330078125, "grad_norm_var": 0.001249551773071289, "learning_rate": 0.0001, "loss": 1.581, "loss/crossentropy": 2.4800407886505127, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.22167325019836426, "step": 7783 }, { "epoch": 0.11623201606701558, "grad_norm": 0.41015625, "grad_norm_var": 0.0012940565745035808, "learning_rate": 0.0001, "loss": 1.563, "loss/crossentropy": 2.7394890785217285, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.2075086608529091, "step": 7784 }, { "epoch": 0.11624694823763056, "grad_norm": 0.33984375, "grad_norm_var": 0.0012262980143229166, "learning_rate": 0.0001, "loss": 1.4475, "loss/crossentropy": 2.741067886352539, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.19363657385110855, "step": 7785 }, { "epoch": 0.11626188040824555, "grad_norm": 0.359375, "grad_norm_var": 0.0009333292643229166, "learning_rate": 0.0001, "loss": 1.4432, "loss/crossentropy": 2.599622368812561, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.19715556502342224, "step": 7786 }, { "epoch": 0.11627681257886052, "grad_norm": 0.298828125, "grad_norm_var": 0.001102129618326823, "learning_rate": 0.0001, "loss": 1.4475, "loss/crossentropy": 2.4960758686065674, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1896408498287201, "step": 7787 }, { "epoch": 0.11629174474947551, "grad_norm": 0.3515625, "grad_norm_var": 0.00104827880859375, "learning_rate": 0.0001, "loss": 1.4514, "loss/crossentropy": 2.6972267627716064, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.18579844385385513, "step": 7788 }, { "epoch": 0.11630667692009049, "grad_norm": 0.337890625, "grad_norm_var": 0.0010538578033447265, "learning_rate": 0.0001, "loss": 1.5505, "loss/crossentropy": 2.402885675430298, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.21066150814294815, "step": 7789 }, { "epoch": 0.11632160909070548, "grad_norm": 0.384765625, "grad_norm_var": 0.0010792891184488933, "learning_rate": 0.0001, "loss": 1.5456, "loss/crossentropy": 2.6102616786956787, "loss/fcd": 1.3359375, "loss/idx": 10.0, "loss/logits": 0.2096867337822914, "step": 7790 }, { "epoch": 0.11633654126132045, "grad_norm": 0.36328125, "grad_norm_var": 0.0010725498199462891, "learning_rate": 0.0001, "loss": 1.5532, "loss/crossentropy": 2.428825855255127, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.20554640889167786, "step": 7791 }, { "epoch": 0.11635147343193543, "grad_norm": 0.380859375, "grad_norm_var": 0.0010030110677083333, "learning_rate": 0.0001, "loss": 1.4829, "loss/crossentropy": 2.7566529512405396, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.20559152960777283, "step": 7792 }, { "epoch": 0.11636640560255042, "grad_norm": 0.3515625, "grad_norm_var": 0.0009696801503499349, "learning_rate": 0.0001, "loss": 1.4112, "loss/crossentropy": 2.6644622087478638, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.18075784295797348, "step": 7793 }, { "epoch": 0.1163813377731654, "grad_norm": 0.361328125, "grad_norm_var": 0.0009478251139322917, "learning_rate": 0.0001, "loss": 1.549, "loss/crossentropy": 2.581592321395874, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.22085721045732498, "step": 7794 }, { "epoch": 0.11639626994378038, "grad_norm": 0.33984375, "grad_norm_var": 0.0009112040201822917, "learning_rate": 0.0001, "loss": 1.4216, "loss/crossentropy": 2.6199324131011963, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18721570819616318, "step": 7795 }, { "epoch": 0.11641120211439536, "grad_norm": 0.34375, "grad_norm_var": 0.0008607069651285807, "learning_rate": 0.0001, "loss": 1.5352, "loss/crossentropy": 2.5896342992782593, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.20708854496479034, "step": 7796 }, { "epoch": 0.11642613428501034, "grad_norm": 0.380859375, "grad_norm_var": 0.0006772359212239583, "learning_rate": 0.0001, "loss": 1.5535, "loss/crossentropy": 2.4381648302078247, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.22142651677131653, "step": 7797 }, { "epoch": 0.11644106645562532, "grad_norm": 0.35546875, "grad_norm_var": 0.0006711165110270183, "learning_rate": 0.0001, "loss": 1.4769, "loss/crossentropy": 2.5066052675247192, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.1956494003534317, "step": 7798 }, { "epoch": 0.1164559986262403, "grad_norm": 0.50390625, "grad_norm_var": 0.0019683202107747396, "learning_rate": 0.0001, "loss": 1.9066, "loss/crossentropy": 2.675329089164734, "loss/fcd": 1.62109375, "loss/idx": 10.0, "loss/logits": 0.2855129539966583, "step": 7799 }, { "epoch": 0.11647093079685529, "grad_norm": 0.404296875, "grad_norm_var": 0.00193632443745931, "learning_rate": 0.0001, "loss": 1.5321, "loss/crossentropy": 2.573494553565979, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.200072281062603, "step": 7800 }, { "epoch": 0.11648586296747027, "grad_norm": 0.357421875, "grad_norm_var": 0.0018941243489583334, "learning_rate": 0.0001, "loss": 1.4654, "loss/crossentropy": 2.7288215160369873, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.1998104676604271, "step": 7801 }, { "epoch": 0.11650079513808524, "grad_norm": 0.396484375, "grad_norm_var": 0.001941537857055664, "learning_rate": 0.0001, "loss": 1.6136, "loss/crossentropy": 2.6289602518081665, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.2229844257235527, "step": 7802 }, { "epoch": 0.11651572730870023, "grad_norm": 0.33984375, "grad_norm_var": 0.00166015625, "learning_rate": 0.0001, "loss": 1.3628, "loss/crossentropy": 2.445866107940674, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.16748791933059692, "step": 7803 }, { "epoch": 0.11653065947931521, "grad_norm": 0.341796875, "grad_norm_var": 0.001692819595336914, "learning_rate": 0.0001, "loss": 1.4665, "loss/crossentropy": 2.485512852668762, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.1891861855983734, "step": 7804 }, { "epoch": 0.1165455916499302, "grad_norm": 0.361328125, "grad_norm_var": 0.0016222476959228515, "learning_rate": 0.0001, "loss": 1.5042, "loss/crossentropy": 2.5429493188858032, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.1917177215218544, "step": 7805 }, { "epoch": 0.11656052382054517, "grad_norm": 0.34375, "grad_norm_var": 0.0016626358032226563, "learning_rate": 0.0001, "loss": 1.535, "loss/crossentropy": 2.462319254875183, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.22247696667909622, "step": 7806 }, { "epoch": 0.11657545599116015, "grad_norm": 0.40234375, "grad_norm_var": 0.0017211278279622396, "learning_rate": 0.0001, "loss": 1.8642, "loss/crossentropy": 2.4546478986740112, "loss/fcd": 1.6015625, "loss/idx": 10.0, "loss/logits": 0.26267974078655243, "step": 7807 }, { "epoch": 0.11659038816177514, "grad_norm": 0.412109375, "grad_norm_var": 0.0018157323201497396, "learning_rate": 0.0001, "loss": 1.5063, "loss/crossentropy": 2.6060672998428345, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.19770404696464539, "step": 7808 }, { "epoch": 0.11660532033239011, "grad_norm": 0.365234375, "grad_norm_var": 0.001785135269165039, "learning_rate": 0.0001, "loss": 1.5506, "loss/crossentropy": 2.624576210975647, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.20680511742830276, "step": 7809 }, { "epoch": 0.1166202525030051, "grad_norm": 0.404296875, "grad_norm_var": 0.001818704605102539, "learning_rate": 0.0001, "loss": 1.4974, "loss/crossentropy": 2.651792883872986, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.18877308815717697, "step": 7810 }, { "epoch": 0.11663518467362008, "grad_norm": 0.33984375, "grad_norm_var": 0.001818704605102539, "learning_rate": 0.0001, "loss": 1.3646, "loss/crossentropy": 2.636184334754944, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.17319496721029282, "step": 7811 }, { "epoch": 0.11665011684423507, "grad_norm": 0.38671875, "grad_norm_var": 0.0017361799875895182, "learning_rate": 0.0001, "loss": 1.4653, "loss/crossentropy": 2.8885613679885864, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.19183270633220673, "step": 7812 }, { "epoch": 0.11666504901485004, "grad_norm": 0.345703125, "grad_norm_var": 0.0018139998118082681, "learning_rate": 0.0001, "loss": 1.5329, "loss/crossentropy": 2.542256712913513, "loss/fcd": 1.31640625, "loss/idx": 10.0, "loss/logits": 0.21651146560907364, "step": 7813 }, { "epoch": 0.11667998118546502, "grad_norm": 0.34765625, "grad_norm_var": 0.0018421014149983723, "learning_rate": 0.0001, "loss": 1.4023, "loss/crossentropy": 2.56398868560791, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.18743300437927246, "step": 7814 }, { "epoch": 0.11669491335608001, "grad_norm": 0.333984375, "grad_norm_var": 0.0008008321126302083, "learning_rate": 0.0001, "loss": 1.2929, "loss/crossentropy": 2.4707478284835815, "loss/fcd": 1.14453125, "loss/idx": 10.0, "loss/logits": 0.1483321338891983, "step": 7815 }, { "epoch": 0.11670984552669499, "grad_norm": 0.40625, "grad_norm_var": 0.0008106072743733724, "learning_rate": 0.0001, "loss": 1.5344, "loss/crossentropy": 2.481953263282776, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.21017303317785263, "step": 7816 }, { "epoch": 0.11672477769730998, "grad_norm": 0.376953125, "grad_norm_var": 0.0008074283599853515, "learning_rate": 0.0001, "loss": 1.602, "loss/crossentropy": 2.3128855228424072, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.21136663854122162, "step": 7817 }, { "epoch": 0.11673970986792495, "grad_norm": 0.478515625, "grad_norm_var": 0.0015284061431884766, "learning_rate": 0.0001, "loss": 1.4248, "loss/crossentropy": 2.7644124031066895, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18261933326721191, "step": 7818 }, { "epoch": 0.11675464203853993, "grad_norm": 0.400390625, "grad_norm_var": 0.0014806111653645833, "learning_rate": 0.0001, "loss": 1.6546, "loss/crossentropy": 2.4979844093322754, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.2443959265947342, "step": 7819 }, { "epoch": 0.11676957420915492, "grad_norm": 0.30859375, "grad_norm_var": 0.0017094771067301432, "learning_rate": 0.0001, "loss": 1.4118, "loss/crossentropy": 2.6634175777435303, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.18131599575281143, "step": 7820 }, { "epoch": 0.11678450637976989, "grad_norm": 0.31640625, "grad_norm_var": 0.001922607421875, "learning_rate": 0.0001, "loss": 1.4523, "loss/crossentropy": 2.46902072429657, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.19062252342700958, "step": 7821 }, { "epoch": 0.11679943855038488, "grad_norm": 0.41796875, "grad_norm_var": 0.0019769668579101562, "learning_rate": 0.0001, "loss": 1.5864, "loss/crossentropy": 2.664587616920471, "loss/fcd": 1.375, "loss/idx": 10.0, "loss/logits": 0.21135097742080688, "step": 7822 }, { "epoch": 0.11681437072099986, "grad_norm": 0.416015625, "grad_norm_var": 0.0020335992177327473, "learning_rate": 0.0001, "loss": 1.655, "loss/crossentropy": 2.4667367935180664, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.23707162588834763, "step": 7823 }, { "epoch": 0.11682930289161483, "grad_norm": 0.353515625, "grad_norm_var": 0.0019859155019124348, "learning_rate": 0.0001, "loss": 1.4596, "loss/crossentropy": 2.484792113304138, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.19394510239362717, "step": 7824 }, { "epoch": 0.11684423506222982, "grad_norm": 0.404296875, "grad_norm_var": 0.0020310560862223308, "learning_rate": 0.0001, "loss": 1.5316, "loss/crossentropy": 2.593080997467041, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.2112480252981186, "step": 7825 }, { "epoch": 0.1168591672328448, "grad_norm": 0.361328125, "grad_norm_var": 0.001991891860961914, "learning_rate": 0.0001, "loss": 1.6261, "loss/crossentropy": 2.7661505937576294, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.23153140395879745, "step": 7826 }, { "epoch": 0.11687409940345979, "grad_norm": 0.375, "grad_norm_var": 0.0019060611724853516, "learning_rate": 0.0001, "loss": 1.6121, "loss/crossentropy": 2.342878222465515, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.22146665304899216, "step": 7827 }, { "epoch": 0.11688903157407476, "grad_norm": 0.328125, "grad_norm_var": 0.0020433902740478516, "learning_rate": 0.0001, "loss": 1.5458, "loss/crossentropy": 2.3992077112197876, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.22551808506250381, "step": 7828 }, { "epoch": 0.11690396374468974, "grad_norm": 0.384765625, "grad_norm_var": 0.001995706558227539, "learning_rate": 0.0001, "loss": 1.4881, "loss/crossentropy": 2.5281022787094116, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.19119607657194138, "step": 7829 }, { "epoch": 0.11691889591530473, "grad_norm": 0.357421875, "grad_norm_var": 0.0019652684529622394, "learning_rate": 0.0001, "loss": 1.5434, "loss/crossentropy": 2.6225247383117676, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.21915461868047714, "step": 7830 }, { "epoch": 0.1169338280859197, "grad_norm": 0.310546875, "grad_norm_var": 0.0021315892537434895, "learning_rate": 0.0001, "loss": 1.442, "loss/crossentropy": 2.5954490900039673, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.19593732804059982, "step": 7831 }, { "epoch": 0.1169487602565347, "grad_norm": 0.49609375, "grad_norm_var": 0.003013356526692708, "learning_rate": 0.0001, "loss": 1.7341, "loss/crossentropy": 2.448954701423645, "loss/fcd": 1.51171875, "loss/idx": 10.0, "loss/logits": 0.2223784476518631, "step": 7832 }, { "epoch": 0.11696369242714967, "grad_norm": 0.3203125, "grad_norm_var": 0.0032396793365478517, "learning_rate": 0.0001, "loss": 1.424, "loss/crossentropy": 2.5854440927505493, "loss/fcd": 1.23828125, "loss/idx": 10.0, "loss/logits": 0.18575040996074677, "step": 7833 }, { "epoch": 0.11697862459776466, "grad_norm": 0.33984375, "grad_norm_var": 0.0025614420572916668, "learning_rate": 0.0001, "loss": 1.4835, "loss/crossentropy": 2.491189479827881, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.18276380002498627, "step": 7834 }, { "epoch": 0.11699355676837964, "grad_norm": 0.34375, "grad_norm_var": 0.0025185743967692056, "learning_rate": 0.0001, "loss": 1.5291, "loss/crossentropy": 2.5614043474197388, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.20491857081651688, "step": 7835 }, { "epoch": 0.11700848893899461, "grad_norm": 0.359375, "grad_norm_var": 0.002300373713175456, "learning_rate": 0.0001, "loss": 1.5804, "loss/crossentropy": 2.705709457397461, "loss/fcd": 1.34765625, "loss/idx": 10.0, "loss/logits": 0.23278887569904327, "step": 7836 }, { "epoch": 0.1170234211096096, "grad_norm": 0.3359375, "grad_norm_var": 0.002190383275349935, "learning_rate": 0.0001, "loss": 1.4077, "loss/crossentropy": 2.6533087491989136, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.18113309144973755, "step": 7837 }, { "epoch": 0.11703835328022458, "grad_norm": 0.326171875, "grad_norm_var": 0.002117919921875, "learning_rate": 0.0001, "loss": 1.4062, "loss/crossentropy": 2.6091235876083374, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17959512770175934, "step": 7838 }, { "epoch": 0.11705328545083957, "grad_norm": 0.322265625, "grad_norm_var": 0.002008056640625, "learning_rate": 0.0001, "loss": 1.4658, "loss/crossentropy": 2.3742077350616455, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.1884610429406166, "step": 7839 }, { "epoch": 0.11706821762145454, "grad_norm": 0.353515625, "grad_norm_var": 0.002008056640625, "learning_rate": 0.0001, "loss": 1.6198, "loss/crossentropy": 2.5658949613571167, "loss/fcd": 1.39453125, "loss/idx": 10.0, "loss/logits": 0.22528325021266937, "step": 7840 }, { "epoch": 0.11708314979206952, "grad_norm": 0.423828125, "grad_norm_var": 0.002153968811035156, "learning_rate": 0.0001, "loss": 1.577, "loss/crossentropy": 2.6731780767440796, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.20976881682872772, "step": 7841 }, { "epoch": 0.11709808196268451, "grad_norm": 0.35546875, "grad_norm_var": 0.0021540164947509766, "learning_rate": 0.0001, "loss": 1.562, "loss/crossentropy": 2.6752654314041138, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.22992228716611862, "step": 7842 }, { "epoch": 0.11711301413329948, "grad_norm": 0.37109375, "grad_norm_var": 0.002146259943644206, "learning_rate": 0.0001, "loss": 1.6399, "loss/crossentropy": 2.6153167486190796, "loss/fcd": 1.41796875, "loss/idx": 10.0, "loss/logits": 0.22193030267953873, "step": 7843 }, { "epoch": 0.11712794630391447, "grad_norm": 0.40625, "grad_norm_var": 0.002216196060180664, "learning_rate": 0.0001, "loss": 1.7252, "loss/crossentropy": 2.3930569887161255, "loss/fcd": 1.484375, "loss/idx": 10.0, "loss/logits": 0.24080801010131836, "step": 7844 }, { "epoch": 0.11714287847452945, "grad_norm": 0.33984375, "grad_norm_var": 0.002211443583170573, "learning_rate": 0.0001, "loss": 1.4682, "loss/crossentropy": 2.43769633769989, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.20262408256530762, "step": 7845 }, { "epoch": 0.11715781064514443, "grad_norm": 0.341796875, "grad_norm_var": 0.0022322972615559894, "learning_rate": 0.0001, "loss": 1.499, "loss/crossentropy": 2.6693681478500366, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.20212507992982864, "step": 7846 }, { "epoch": 0.11717274281575941, "grad_norm": 0.302734375, "grad_norm_var": 0.0022867202758789064, "learning_rate": 0.0001, "loss": 1.3739, "loss/crossentropy": 2.477627396583557, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.17469681799411774, "step": 7847 }, { "epoch": 0.11718767498637439, "grad_norm": 0.34765625, "grad_norm_var": 0.0009434382120768229, "learning_rate": 0.0001, "loss": 1.4228, "loss/crossentropy": 2.718114733695984, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.18060291558504105, "step": 7848 }, { "epoch": 0.11720260715698938, "grad_norm": 0.330078125, "grad_norm_var": 0.0009115695953369141, "learning_rate": 0.0001, "loss": 1.5945, "loss/crossentropy": 2.6313337087631226, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.22735070437192917, "step": 7849 }, { "epoch": 0.11721753932760436, "grad_norm": 0.349609375, "grad_norm_var": 0.0009043375651041667, "learning_rate": 0.0001, "loss": 1.4784, "loss/crossentropy": 2.7108110189437866, "loss/fcd": 1.2734375, "loss/idx": 10.0, "loss/logits": 0.20497766137123108, "step": 7850 }, { "epoch": 0.11723247149821935, "grad_norm": 0.359375, "grad_norm_var": 0.0009053548177083334, "learning_rate": 0.0001, "loss": 1.5969, "loss/crossentropy": 2.4258124828338623, "loss/fcd": 1.37109375, "loss/idx": 10.0, "loss/logits": 0.22582397609949112, "step": 7851 }, { "epoch": 0.11724740366883432, "grad_norm": 0.53515625, "grad_norm_var": 0.0030196507771809897, "learning_rate": 0.0001, "loss": 1.678, "loss/crossentropy": 2.3212146759033203, "loss/fcd": 1.4453125, "loss/idx": 10.0, "loss/logits": 0.23271331191062927, "step": 7852 }, { "epoch": 0.1172623358394493, "grad_norm": 0.376953125, "grad_norm_var": 0.0029792626698811847, "learning_rate": 0.0001, "loss": 1.6425, "loss/crossentropy": 2.5278948545455933, "loss/fcd": 1.421875, "loss/idx": 10.0, "loss/logits": 0.22058489173650742, "step": 7853 }, { "epoch": 0.11727726801006429, "grad_norm": 0.439453125, "grad_norm_var": 0.0031931400299072266, "learning_rate": 0.0001, "loss": 1.6154, "loss/crossentropy": 2.399264693260193, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.209121473133564, "step": 7854 }, { "epoch": 0.11729220018067926, "grad_norm": 0.318359375, "grad_norm_var": 0.0032200972239176434, "learning_rate": 0.0001, "loss": 1.3604, "loss/crossentropy": 2.531600594520569, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.16113601624965668, "step": 7855 }, { "epoch": 0.11730713235129425, "grad_norm": 0.3359375, "grad_norm_var": 0.003282610575358073, "learning_rate": 0.0001, "loss": 1.4003, "loss/crossentropy": 2.6599862575531006, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17761167883872986, "step": 7856 }, { "epoch": 0.11732206452190923, "grad_norm": 0.494140625, "grad_norm_var": 0.004088274637858073, "learning_rate": 0.0001, "loss": 1.6228, "loss/crossentropy": 2.9875433444976807, "loss/fcd": 1.41015625, "loss/idx": 10.0, "loss/logits": 0.2126188948750496, "step": 7857 }, { "epoch": 0.1173369966925242, "grad_norm": 0.400390625, "grad_norm_var": 0.0040959517161051435, "learning_rate": 0.0001, "loss": 1.4084, "loss/crossentropy": 2.5915180444717407, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1661764159798622, "step": 7858 }, { "epoch": 0.1173519288631392, "grad_norm": 0.353515625, "grad_norm_var": 0.0041315714518229164, "learning_rate": 0.0001, "loss": 1.4785, "loss/crossentropy": 2.5558621883392334, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.20119287073612213, "step": 7859 }, { "epoch": 0.11736686103375417, "grad_norm": 0.396484375, "grad_norm_var": 0.0040993849436442055, "learning_rate": 0.0001, "loss": 1.584, "loss/crossentropy": 2.640007972717285, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.2285642921924591, "step": 7860 }, { "epoch": 0.11738179320436916, "grad_norm": 0.3984375, "grad_norm_var": 0.004028813044230143, "learning_rate": 0.0001, "loss": 1.4259, "loss/crossentropy": 2.722649931907654, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.17590487003326416, "step": 7861 }, { "epoch": 0.11739672537498413, "grad_norm": 0.30859375, "grad_norm_var": 0.004266866048177083, "learning_rate": 0.0001, "loss": 1.3997, "loss/crossentropy": 2.4815951585769653, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17704185843467712, "step": 7862 }, { "epoch": 0.11741165754559911, "grad_norm": 0.361328125, "grad_norm_var": 0.0038939793904622395, "learning_rate": 0.0001, "loss": 1.3893, "loss/crossentropy": 2.5924949645996094, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.1705915853381157, "step": 7863 }, { "epoch": 0.1174265897162141, "grad_norm": 0.369140625, "grad_norm_var": 0.0038256168365478514, "learning_rate": 0.0001, "loss": 1.4434, "loss/crossentropy": 2.54180371761322, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.19341447949409485, "step": 7864 }, { "epoch": 0.11744152188682908, "grad_norm": 0.306640625, "grad_norm_var": 0.004025125503540039, "learning_rate": 0.0001, "loss": 1.437, "loss/crossentropy": 2.685887336730957, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.1830444037914276, "step": 7865 }, { "epoch": 0.11745645405744407, "grad_norm": 0.384765625, "grad_norm_var": 0.003953027725219727, "learning_rate": 0.0001, "loss": 1.4626, "loss/crossentropy": 2.6996039152145386, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.18528051674365997, "step": 7866 }, { "epoch": 0.11747138622805904, "grad_norm": 0.33984375, "grad_norm_var": 0.004040129979451497, "learning_rate": 0.0001, "loss": 1.4252, "loss/crossentropy": 2.5315572023391724, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.17133381217718124, "step": 7867 }, { "epoch": 0.11748631839867402, "grad_norm": 0.35546875, "grad_norm_var": 0.0023994286855061847, "learning_rate": 0.0001, "loss": 1.3458, "loss/crossentropy": 2.704077363014221, "loss/fcd": 1.17578125, "loss/idx": 10.0, "loss/logits": 0.17006303369998932, "step": 7868 }, { "epoch": 0.117501250569289, "grad_norm": 0.384765625, "grad_norm_var": 0.002409219741821289, "learning_rate": 0.0001, "loss": 1.6398, "loss/crossentropy": 2.3539472818374634, "loss/fcd": 1.390625, "loss/idx": 10.0, "loss/logits": 0.24918177723884583, "step": 7869 }, { "epoch": 0.11751618273990398, "grad_norm": 0.375, "grad_norm_var": 0.002086639404296875, "learning_rate": 0.0001, "loss": 1.3698, "loss/crossentropy": 2.597172260284424, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.16671540588140488, "step": 7870 }, { "epoch": 0.11753111491051897, "grad_norm": 0.43359375, "grad_norm_var": 0.0021588484446207683, "learning_rate": 0.0001, "loss": 1.657, "loss/crossentropy": 2.865199089050293, "loss/fcd": 1.4296875, "loss/idx": 10.0, "loss/logits": 0.22731474041938782, "step": 7871 }, { "epoch": 0.11754604708113395, "grad_norm": 0.314453125, "grad_norm_var": 0.0022992451985677084, "learning_rate": 0.0001, "loss": 1.3661, "loss/crossentropy": 2.6101412773132324, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.17473848909139633, "step": 7872 }, { "epoch": 0.11756097925174894, "grad_norm": 0.3515625, "grad_norm_var": 0.001277017593383789, "learning_rate": 0.0001, "loss": 1.4679, "loss/crossentropy": 2.762803792953491, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.20620019733905792, "step": 7873 }, { "epoch": 0.11757591142236391, "grad_norm": 0.7109375, "grad_norm_var": 0.008785438537597657, "learning_rate": 0.0001, "loss": 1.6709, "loss/crossentropy": 3.107850432395935, "loss/fcd": 1.4296875, "loss/idx": 10.0, "loss/logits": 0.24120120704174042, "step": 7874 }, { "epoch": 0.11759084359297889, "grad_norm": 0.369140625, "grad_norm_var": 0.008737119038899739, "learning_rate": 0.0001, "loss": 1.5717, "loss/crossentropy": 2.5082499980926514, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.23181000351905823, "step": 7875 }, { "epoch": 0.11760577576359388, "grad_norm": 0.365234375, "grad_norm_var": 0.008750343322753906, "learning_rate": 0.0001, "loss": 1.4854, "loss/crossentropy": 2.6845768690109253, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.19238994270563126, "step": 7876 }, { "epoch": 0.11762070793420885, "grad_norm": 0.38671875, "grad_norm_var": 0.008734893798828126, "learning_rate": 0.0001, "loss": 1.4981, "loss/crossentropy": 2.4636791944503784, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.19731394201517105, "step": 7877 }, { "epoch": 0.11763564010482384, "grad_norm": 0.32421875, "grad_norm_var": 0.008596547444661458, "learning_rate": 0.0001, "loss": 1.5166, "loss/crossentropy": 2.5353755950927734, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.20799855887889862, "step": 7878 }, { "epoch": 0.11765057227543882, "grad_norm": 0.357421875, "grad_norm_var": 0.00860894521077474, "learning_rate": 0.0001, "loss": 1.3869, "loss/crossentropy": 2.7095896005630493, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.17982713133096695, "step": 7879 }, { "epoch": 0.1176655044460538, "grad_norm": 0.41015625, "grad_norm_var": 0.00863798459370931, "learning_rate": 0.0001, "loss": 1.677, "loss/crossentropy": 2.6695363521575928, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.23558732867240906, "step": 7880 }, { "epoch": 0.11768043661666878, "grad_norm": 0.345703125, "grad_norm_var": 0.008322000503540039, "learning_rate": 0.0001, "loss": 1.5003, "loss/crossentropy": 2.5702210664749146, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.2073303535580635, "step": 7881 }, { "epoch": 0.11769536878728376, "grad_norm": 0.427734375, "grad_norm_var": 0.008418512344360352, "learning_rate": 0.0001, "loss": 1.7665, "loss/crossentropy": 2.2062264680862427, "loss/fcd": 1.48828125, "loss/idx": 10.0, "loss/logits": 0.2782369703054428, "step": 7882 }, { "epoch": 0.11771030095789875, "grad_norm": 0.380859375, "grad_norm_var": 0.008245277404785156, "learning_rate": 0.0001, "loss": 1.5119, "loss/crossentropy": 2.6222342252731323, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.1994193121790886, "step": 7883 }, { "epoch": 0.11772523312851373, "grad_norm": 0.376953125, "grad_norm_var": 0.008165725072224935, "learning_rate": 0.0001, "loss": 1.4118, "loss/crossentropy": 2.5804349184036255, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.18133056163787842, "step": 7884 }, { "epoch": 0.1177401652991287, "grad_norm": 0.3515625, "grad_norm_var": 0.008278401692708333, "learning_rate": 0.0001, "loss": 1.5166, "loss/crossentropy": 2.5266374349594116, "loss/fcd": 1.30859375, "loss/idx": 10.0, "loss/logits": 0.20798218250274658, "step": 7885 }, { "epoch": 0.11775509746974369, "grad_norm": 0.390625, "grad_norm_var": 0.008257039388020833, "learning_rate": 0.0001, "loss": 1.5548, "loss/crossentropy": 2.6099050045013428, "loss/fcd": 1.34375, "loss/idx": 10.0, "loss/logits": 0.2110181525349617, "step": 7886 }, { "epoch": 0.11777002964035867, "grad_norm": 0.333984375, "grad_norm_var": 0.008345397313435872, "learning_rate": 0.0001, "loss": 1.5424, "loss/crossentropy": 2.647817611694336, "loss/fcd": 1.33203125, "loss/idx": 10.0, "loss/logits": 0.210376538336277, "step": 7887 }, { "epoch": 0.11778496181097366, "grad_norm": 0.47265625, "grad_norm_var": 0.00837243398030599, "learning_rate": 0.0001, "loss": 1.4561, "loss/crossentropy": 2.5752400159835815, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.17875535041093826, "step": 7888 }, { "epoch": 0.11779989398158863, "grad_norm": 0.353515625, "grad_norm_var": 0.008360783259073893, "learning_rate": 0.0001, "loss": 1.43, "loss/crossentropy": 2.63405978679657, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.1838691607117653, "step": 7889 }, { "epoch": 0.11781482615220361, "grad_norm": 0.34765625, "grad_norm_var": 0.0014191786448160806, "learning_rate": 0.0001, "loss": 1.3993, "loss/crossentropy": 2.44867742061615, "loss/fcd": 1.2265625, "loss/idx": 10.0, "loss/logits": 0.17273256182670593, "step": 7890 }, { "epoch": 0.1178297583228186, "grad_norm": 0.447265625, "grad_norm_var": 0.0017434279123942056, "learning_rate": 0.0001, "loss": 1.7318, "loss/crossentropy": 2.4477192163467407, "loss/fcd": 1.4921875, "loss/idx": 10.0, "loss/logits": 0.23963838070631027, "step": 7891 }, { "epoch": 0.11784469049343357, "grad_norm": 0.41015625, "grad_norm_var": 0.0017840067545572917, "learning_rate": 0.0001, "loss": 1.5941, "loss/crossentropy": 2.5242031812667847, "loss/fcd": 1.38671875, "loss/idx": 10.0, "loss/logits": 0.20739063620567322, "step": 7892 }, { "epoch": 0.11785962266404856, "grad_norm": 0.328125, "grad_norm_var": 0.0019642512003580728, "learning_rate": 0.0001, "loss": 1.5164, "loss/crossentropy": 2.789105772972107, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.21169795095920563, "step": 7893 }, { "epoch": 0.11787455483466354, "grad_norm": 0.3359375, "grad_norm_var": 0.0018877665201822917, "learning_rate": 0.0001, "loss": 1.4267, "loss/crossentropy": 2.550053358078003, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.18059350550174713, "step": 7894 }, { "epoch": 0.11788948700527853, "grad_norm": 0.310546875, "grad_norm_var": 0.0021624247233072917, "learning_rate": 0.0001, "loss": 1.4229, "loss/crossentropy": 2.5701197385787964, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.19247254729270935, "step": 7895 }, { "epoch": 0.1179044191758935, "grad_norm": 0.333984375, "grad_norm_var": 0.0021828810373942056, "learning_rate": 0.0001, "loss": 1.492, "loss/crossentropy": 2.44136905670166, "loss/fcd": 1.29296875, "loss/idx": 10.0, "loss/logits": 0.1989903673529625, "step": 7896 }, { "epoch": 0.11791935134650848, "grad_norm": 0.314453125, "grad_norm_var": 0.002352253595987956, "learning_rate": 0.0001, "loss": 1.4072, "loss/crossentropy": 2.466074824333191, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.17286698520183563, "step": 7897 }, { "epoch": 0.11793428351712347, "grad_norm": 0.365234375, "grad_norm_var": 0.002113199234008789, "learning_rate": 0.0001, "loss": 1.4905, "loss/crossentropy": 2.467971086502075, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.2014220654964447, "step": 7898 }, { "epoch": 0.11794921568773845, "grad_norm": 0.32421875, "grad_norm_var": 0.0022003173828125, "learning_rate": 0.0001, "loss": 1.4753, "loss/crossentropy": 2.7379837036132812, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.20576459169387817, "step": 7899 }, { "epoch": 0.11796414785835344, "grad_norm": 0.291015625, "grad_norm_var": 0.002494049072265625, "learning_rate": 0.0001, "loss": 1.4088, "loss/crossentropy": 2.6257550716400146, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.1783597618341446, "step": 7900 }, { "epoch": 0.11797908002896841, "grad_norm": 0.4453125, "grad_norm_var": 0.002976226806640625, "learning_rate": 0.0001, "loss": 1.6394, "loss/crossentropy": 2.4579113721847534, "loss/fcd": 1.4296875, "loss/idx": 10.0, "loss/logits": 0.20970580726861954, "step": 7901 }, { "epoch": 0.11799401219958339, "grad_norm": 0.3125, "grad_norm_var": 0.003067779541015625, "learning_rate": 0.0001, "loss": 1.4439, "loss/crossentropy": 2.404306650161743, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1861073449254036, "step": 7902 }, { "epoch": 0.11800894437019838, "grad_norm": 0.32421875, "grad_norm_var": 0.0031048933664957683, "learning_rate": 0.0001, "loss": 1.3483, "loss/crossentropy": 2.5871273279190063, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.1608131378889084, "step": 7903 }, { "epoch": 0.11802387654081335, "grad_norm": 0.392578125, "grad_norm_var": 0.002274004618326823, "learning_rate": 0.0001, "loss": 1.5715, "loss/crossentropy": 2.675898790359497, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.21210502833127975, "step": 7904 }, { "epoch": 0.11803880871142834, "grad_norm": 0.302734375, "grad_norm_var": 0.002426910400390625, "learning_rate": 0.0001, "loss": 1.4628, "loss/crossentropy": 2.796049475669861, "loss/fcd": 1.265625, "loss/idx": 10.0, "loss/logits": 0.1972200945019722, "step": 7905 }, { "epoch": 0.11805374088204332, "grad_norm": 0.376953125, "grad_norm_var": 0.0024748325347900392, "learning_rate": 0.0001, "loss": 1.5883, "loss/crossentropy": 2.2396914958953857, "loss/fcd": 1.37890625, "loss/idx": 10.0, "loss/logits": 0.20935942977666855, "step": 7906 }, { "epoch": 0.1180686730526583, "grad_norm": 0.31640625, "grad_norm_var": 0.0018646240234375, "learning_rate": 0.0001, "loss": 1.4553, "loss/crossentropy": 2.4698067903518677, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.197445310652256, "step": 7907 }, { "epoch": 0.11808360522327328, "grad_norm": 0.328125, "grad_norm_var": 0.0015481948852539063, "learning_rate": 0.0001, "loss": 1.5153, "loss/crossentropy": 2.624740719795227, "loss/fcd": 1.296875, "loss/idx": 10.0, "loss/logits": 0.2183777317404747, "step": 7908 }, { "epoch": 0.11809853739388826, "grad_norm": 0.294921875, "grad_norm_var": 0.001659250259399414, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.573971152305603, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.1827159896492958, "step": 7909 }, { "epoch": 0.11811346956450325, "grad_norm": 0.330078125, "grad_norm_var": 0.0016611099243164062, "learning_rate": 0.0001, "loss": 1.381, "loss/crossentropy": 2.6466857194900513, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.16221725940704346, "step": 7910 }, { "epoch": 0.11812840173511822, "grad_norm": 0.3359375, "grad_norm_var": 0.0016179243723551432, "learning_rate": 0.0001, "loss": 1.6476, "loss/crossentropy": 2.505538582801819, "loss/fcd": 1.40625, "loss/idx": 10.0, "loss/logits": 0.24135024845600128, "step": 7911 }, { "epoch": 0.11814333390573321, "grad_norm": 0.37890625, "grad_norm_var": 0.0017272313435872397, "learning_rate": 0.0001, "loss": 1.5727, "loss/crossentropy": 2.7296544313430786, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.21726958453655243, "step": 7912 }, { "epoch": 0.11815826607634819, "grad_norm": 0.359375, "grad_norm_var": 0.001702737808227539, "learning_rate": 0.0001, "loss": 1.5055, "loss/crossentropy": 2.4496735334396362, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.20471875369548798, "step": 7913 }, { "epoch": 0.11817319824696317, "grad_norm": 0.37109375, "grad_norm_var": 0.00172271728515625, "learning_rate": 0.0001, "loss": 1.5202, "loss/crossentropy": 2.295517683029175, "loss/fcd": 1.328125, "loss/idx": 10.0, "loss/logits": 0.1920952871441841, "step": 7914 }, { "epoch": 0.11818813041757815, "grad_norm": 0.31640625, "grad_norm_var": 0.0017458597819010417, "learning_rate": 0.0001, "loss": 1.4664, "loss/crossentropy": 2.537576675415039, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.1890483722090721, "step": 7915 }, { "epoch": 0.11820306258819313, "grad_norm": 0.4921875, "grad_norm_var": 0.0029000441233317056, "learning_rate": 0.0001, "loss": 1.5703, "loss/crossentropy": 2.618733525276184, "loss/fcd": 1.3671875, "loss/idx": 10.0, "loss/logits": 0.20312673598527908, "step": 7916 }, { "epoch": 0.11821799475880812, "grad_norm": 0.373046875, "grad_norm_var": 0.002354876200358073, "learning_rate": 0.0001, "loss": 1.5388, "loss/crossentropy": 2.6055108308792114, "loss/fcd": 1.33984375, "loss/idx": 10.0, "loss/logits": 0.1990014612674713, "step": 7917 }, { "epoch": 0.1182329269294231, "grad_norm": 0.328125, "grad_norm_var": 0.002291297912597656, "learning_rate": 0.0001, "loss": 1.3727, "loss/crossentropy": 2.8172601461410522, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.16955340653657913, "step": 7918 }, { "epoch": 0.11824785910003807, "grad_norm": 0.3828125, "grad_norm_var": 0.002294158935546875, "learning_rate": 0.0001, "loss": 1.5364, "loss/crossentropy": 2.385621666908264, "loss/fcd": 1.3515625, "loss/idx": 10.0, "loss/logits": 0.18484948575496674, "step": 7919 }, { "epoch": 0.11826279127065306, "grad_norm": 0.3671875, "grad_norm_var": 0.0022071679433186848, "learning_rate": 0.0001, "loss": 1.4358, "loss/crossentropy": 2.6856013536453247, "loss/fcd": 1.25, "loss/idx": 10.0, "loss/logits": 0.18577535450458527, "step": 7920 }, { "epoch": 0.11827772344126804, "grad_norm": 0.36328125, "grad_norm_var": 0.0020273208618164064, "learning_rate": 0.0001, "loss": 1.6609, "loss/crossentropy": 2.436980128288269, "loss/fcd": 1.44140625, "loss/idx": 10.0, "loss/logits": 0.21952537447214127, "step": 7921 }, { "epoch": 0.11829265561188303, "grad_norm": 0.345703125, "grad_norm_var": 0.0020059585571289063, "learning_rate": 0.0001, "loss": 1.4536, "loss/crossentropy": 2.51260769367218, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.18407603353261948, "step": 7922 }, { "epoch": 0.118307587782498, "grad_norm": 0.33203125, "grad_norm_var": 0.0019403457641601562, "learning_rate": 0.0001, "loss": 1.4411, "loss/crossentropy": 2.558246374130249, "loss/fcd": 1.2578125, "loss/idx": 10.0, "loss/logits": 0.1833285316824913, "step": 7923 }, { "epoch": 0.11832251995311298, "grad_norm": 0.35546875, "grad_norm_var": 0.0018847147623697917, "learning_rate": 0.0001, "loss": 1.3668, "loss/crossentropy": 2.876352548599243, "loss/fcd": 1.20703125, "loss/idx": 10.0, "loss/logits": 0.1598137468099594, "step": 7924 }, { "epoch": 0.11833745212372797, "grad_norm": 0.5234375, "grad_norm_var": 0.0032292524973551433, "learning_rate": 0.0001, "loss": 1.94, "loss/crossentropy": 2.5567837953567505, "loss/fcd": 1.68359375, "loss/idx": 10.0, "loss/logits": 0.25635771453380585, "step": 7925 }, { "epoch": 0.11835238429434294, "grad_norm": 0.357421875, "grad_norm_var": 0.0031224409739176433, "learning_rate": 0.0001, "loss": 1.5117, "loss/crossentropy": 2.8304331302642822, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.2070559784770012, "step": 7926 }, { "epoch": 0.11836731646495793, "grad_norm": 0.404296875, "grad_norm_var": 0.0030684789021809896, "learning_rate": 0.0001, "loss": 1.4081, "loss/crossentropy": 2.3649027347564697, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.16589860618114471, "step": 7927 }, { "epoch": 0.11838224863557291, "grad_norm": 0.3359375, "grad_norm_var": 0.0031796773274739582, "learning_rate": 0.0001, "loss": 1.4603, "loss/crossentropy": 2.6473742723464966, "loss/fcd": 1.27734375, "loss/idx": 10.0, "loss/logits": 0.18300297111272812, "step": 7928 }, { "epoch": 0.11839718080618788, "grad_norm": 0.404296875, "grad_norm_var": 0.0032092889149983725, "learning_rate": 0.0001, "loss": 1.6819, "loss/crossentropy": 2.2393224239349365, "loss/fcd": 1.46875, "loss/idx": 10.0, "loss/logits": 0.21313291043043137, "step": 7929 }, { "epoch": 0.11841211297680287, "grad_norm": 0.35546875, "grad_norm_var": 0.0032395521799723306, "learning_rate": 0.0001, "loss": 1.3865, "loss/crossentropy": 2.7127360105514526, "loss/fcd": 1.203125, "loss/idx": 10.0, "loss/logits": 0.18333156406879425, "step": 7930 }, { "epoch": 0.11842704514741785, "grad_norm": 0.37890625, "grad_norm_var": 0.002976083755493164, "learning_rate": 0.0001, "loss": 1.4578, "loss/crossentropy": 2.3346856832504272, "loss/fcd": 1.28125, "loss/idx": 10.0, "loss/logits": 0.17653872817754745, "step": 7931 }, { "epoch": 0.11844197731803284, "grad_norm": 0.330078125, "grad_norm_var": 0.00222015380859375, "learning_rate": 0.0001, "loss": 1.4408, "loss/crossentropy": 2.691187858581543, "loss/fcd": 1.24609375, "loss/idx": 10.0, "loss/logits": 0.19471141695976257, "step": 7932 }, { "epoch": 0.11845690948864782, "grad_norm": 0.34765625, "grad_norm_var": 0.0022538344065348307, "learning_rate": 0.0001, "loss": 1.5228, "loss/crossentropy": 2.6119556427001953, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20250985026359558, "step": 7933 }, { "epoch": 0.1184718416592628, "grad_norm": 0.337890625, "grad_norm_var": 0.0022059122721354166, "learning_rate": 0.0001, "loss": 1.341, "loss/crossentropy": 2.5430675745010376, "loss/fcd": 1.18359375, "loss/idx": 10.0, "loss/logits": 0.1574319452047348, "step": 7934 }, { "epoch": 0.11848677382987778, "grad_norm": 0.314453125, "grad_norm_var": 0.002382262547810872, "learning_rate": 0.0001, "loss": 1.4024, "loss/crossentropy": 2.4584470987319946, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17978178709745407, "step": 7935 }, { "epoch": 0.11850170600049276, "grad_norm": 0.416015625, "grad_norm_var": 0.0025400161743164063, "learning_rate": 0.0001, "loss": 1.5254, "loss/crossentropy": 2.5635021924972534, "loss/fcd": 1.3203125, "loss/idx": 10.0, "loss/logits": 0.20512094348669052, "step": 7936 }, { "epoch": 0.11851663817110775, "grad_norm": 0.322265625, "grad_norm_var": 0.0026758670806884765, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.627357602119446, "loss/fcd": 1.19140625, "loss/idx": 10.0, "loss/logits": 0.16529157757759094, "step": 7937 }, { "epoch": 0.11853157034172272, "grad_norm": 0.30859375, "grad_norm_var": 0.0028640111287434895, "learning_rate": 0.0001, "loss": 1.4381, "loss/crossentropy": 2.5204449892044067, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.1841781958937645, "step": 7938 }, { "epoch": 0.11854650251233771, "grad_norm": 0.3671875, "grad_norm_var": 0.0027913411458333334, "learning_rate": 0.0001, "loss": 1.4418, "loss/crossentropy": 2.9121440649032593, "loss/fcd": 1.25390625, "loss/idx": 10.0, "loss/logits": 0.18788985908031464, "step": 7939 }, { "epoch": 0.11856143468295269, "grad_norm": 0.3203125, "grad_norm_var": 0.0029189427693684894, "learning_rate": 0.0001, "loss": 1.3037, "loss/crossentropy": 2.568062424659729, "loss/fcd": 1.1484375, "loss/idx": 10.0, "loss/logits": 0.1552230343222618, "step": 7940 }, { "epoch": 0.11857636685356766, "grad_norm": 0.37109375, "grad_norm_var": 0.0011311848958333334, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.6449254751205444, "loss/fcd": 1.32421875, "loss/idx": 10.0, "loss/logits": 0.20464543253183365, "step": 7941 }, { "epoch": 0.11859129902418265, "grad_norm": 0.318359375, "grad_norm_var": 0.0012112935384114584, "learning_rate": 0.0001, "loss": 1.5033, "loss/crossentropy": 2.2245360016822815, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.20247427374124527, "step": 7942 }, { "epoch": 0.11860623119479763, "grad_norm": 0.357421875, "grad_norm_var": 0.0010220845540364583, "learning_rate": 0.0001, "loss": 1.3188, "loss/crossentropy": 2.4591978788375854, "loss/fcd": 1.16796875, "loss/idx": 10.0, "loss/logits": 0.1507854089140892, "step": 7943 }, { "epoch": 0.11862116336541262, "grad_norm": 0.46484375, "grad_norm_var": 0.0018340428670247397, "learning_rate": 0.0001, "loss": 1.5811, "loss/crossentropy": 2.566635012626648, "loss/fcd": 1.359375, "loss/idx": 10.0, "loss/logits": 0.2217583954334259, "step": 7944 }, { "epoch": 0.1186360955360276, "grad_norm": 0.33984375, "grad_norm_var": 0.0016887505849202474, "learning_rate": 0.0001, "loss": 1.5293, "loss/crossentropy": 2.6190954446792603, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.21675963699817657, "step": 7945 }, { "epoch": 0.11865102770664257, "grad_norm": 0.328125, "grad_norm_var": 0.0017270247141520183, "learning_rate": 0.0001, "loss": 1.4017, "loss/crossentropy": 2.6315102577209473, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.18295905739068985, "step": 7946 }, { "epoch": 0.11866595987725756, "grad_norm": 0.34375, "grad_norm_var": 0.0016755263010660808, "learning_rate": 0.0001, "loss": 1.519, "loss/crossentropy": 2.7407848834991455, "loss/fcd": 1.3046875, "loss/idx": 10.0, "loss/logits": 0.21426743268966675, "step": 7947 }, { "epoch": 0.11868089204787254, "grad_norm": 0.314453125, "grad_norm_var": 0.0017307122548421224, "learning_rate": 0.0001, "loss": 1.3577, "loss/crossentropy": 2.6398009061813354, "loss/fcd": 1.1875, "loss/idx": 10.0, "loss/logits": 0.1701505333185196, "step": 7948 }, { "epoch": 0.11869582421848753, "grad_norm": 0.341796875, "grad_norm_var": 0.0017333348592122396, "learning_rate": 0.0001, "loss": 1.302, "loss/crossentropy": 2.684629201889038, "loss/fcd": 1.15234375, "loss/idx": 10.0, "loss/logits": 0.14962586760520935, "step": 7949 }, { "epoch": 0.1187107563891025, "grad_norm": 0.3359375, "grad_norm_var": 0.0017361799875895182, "learning_rate": 0.0001, "loss": 1.4971, "loss/crossentropy": 2.6090662479400635, "loss/fcd": 1.30078125, "loss/idx": 10.0, "loss/logits": 0.19632363319396973, "step": 7950 }, { "epoch": 0.11872568855971748, "grad_norm": 0.37109375, "grad_norm_var": 0.0016850153605143228, "learning_rate": 0.0001, "loss": 1.45, "loss/crossentropy": 2.5737498998641968, "loss/fcd": 1.26171875, "loss/idx": 10.0, "loss/logits": 0.1882946640253067, "step": 7951 }, { "epoch": 0.11874062073033247, "grad_norm": 0.408203125, "grad_norm_var": 0.0016214370727539063, "learning_rate": 0.0001, "loss": 1.7276, "loss/crossentropy": 2.657151937484741, "loss/fcd": 1.46484375, "loss/idx": 10.0, "loss/logits": 0.2627495974302292, "step": 7952 }, { "epoch": 0.11875555290094744, "grad_norm": 0.30078125, "grad_norm_var": 0.0017321109771728516, "learning_rate": 0.0001, "loss": 1.3781, "loss/crossentropy": 2.5839829444885254, "loss/fcd": 1.19921875, "loss/idx": 10.0, "loss/logits": 0.17892736941576004, "step": 7953 }, { "epoch": 0.11877048507156243, "grad_norm": 0.322265625, "grad_norm_var": 0.0016692479451497395, "learning_rate": 0.0001, "loss": 1.4023, "loss/crossentropy": 2.734254002571106, "loss/fcd": 1.22265625, "loss/idx": 10.0, "loss/logits": 0.17960232496261597, "step": 7954 }, { "epoch": 0.11878541724217741, "grad_norm": 0.31640625, "grad_norm_var": 0.0017163594563802083, "learning_rate": 0.0001, "loss": 1.457, "loss/crossentropy": 2.577030301094055, "loss/fcd": 1.26953125, "loss/idx": 10.0, "loss/logits": 0.18746916949748993, "step": 7955 }, { "epoch": 0.1188003494127924, "grad_norm": 0.333984375, "grad_norm_var": 0.0016790866851806641, "learning_rate": 0.0001, "loss": 1.4924, "loss/crossentropy": 2.5724011659622192, "loss/fcd": 1.2890625, "loss/idx": 10.0, "loss/logits": 0.20336024463176727, "step": 7956 }, { "epoch": 0.11881528158340737, "grad_norm": 0.34375, "grad_norm_var": 0.001641702651977539, "learning_rate": 0.0001, "loss": 1.4065, "loss/crossentropy": 2.6942394971847534, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.19169186055660248, "step": 7957 }, { "epoch": 0.11883021375402235, "grad_norm": 0.3125, "grad_norm_var": 0.0016656875610351562, "learning_rate": 0.0001, "loss": 1.383, "loss/crossentropy": 2.541856288909912, "loss/fcd": 1.21875, "loss/idx": 10.0, "loss/logits": 0.16422511637210846, "step": 7958 }, { "epoch": 0.11884514592463734, "grad_norm": 0.369140625, "grad_norm_var": 0.00169219970703125, "learning_rate": 0.0001, "loss": 1.4245, "loss/crossentropy": 2.6938729286193848, "loss/fcd": 1.2421875, "loss/idx": 10.0, "loss/logits": 0.1823524534702301, "step": 7959 }, { "epoch": 0.11886007809525231, "grad_norm": 0.349609375, "grad_norm_var": 0.0007065931955973307, "learning_rate": 0.0001, "loss": 1.4072, "loss/crossentropy": 2.6034282445907593, "loss/fcd": 1.23046875, "loss/idx": 10.0, "loss/logits": 0.17675704509019852, "step": 7960 }, { "epoch": 0.1188750102658673, "grad_norm": 0.326171875, "grad_norm_var": 0.0007176081339518229, "learning_rate": 0.0001, "loss": 1.4181, "loss/crossentropy": 2.606317639350891, "loss/fcd": 1.234375, "loss/idx": 10.0, "loss/logits": 0.18371620029211044, "step": 7961 }, { "epoch": 0.11888994243648228, "grad_norm": 0.546875, "grad_norm_var": 0.0034021377563476563, "learning_rate": 0.0001, "loss": 1.8411, "loss/crossentropy": 2.6887789964675903, "loss/fcd": 1.58984375, "loss/idx": 10.0, "loss/logits": 0.25124641507864, "step": 7962 }, { "epoch": 0.11890487460709726, "grad_norm": 0.388671875, "grad_norm_var": 0.003477080663045247, "learning_rate": 0.0001, "loss": 1.5695, "loss/crossentropy": 2.657323479652405, "loss/fcd": 1.35546875, "loss/idx": 10.0, "loss/logits": 0.21401910483837128, "step": 7963 }, { "epoch": 0.11891980677771224, "grad_norm": 0.31640625, "grad_norm_var": 0.0034667332967122395, "learning_rate": 0.0001, "loss": 1.3915, "loss/crossentropy": 2.769134759902954, "loss/fcd": 1.21484375, "loss/idx": 10.0, "loss/logits": 0.17666351050138474, "step": 7964 }, { "epoch": 0.11893473894832722, "grad_norm": 0.328125, "grad_norm_var": 0.0035028934478759767, "learning_rate": 0.0001, "loss": 1.3609, "loss/crossentropy": 2.4753655195236206, "loss/fcd": 1.1953125, "loss/idx": 10.0, "loss/logits": 0.16559947282075882, "step": 7965 }, { "epoch": 0.11894967111894221, "grad_norm": 0.322265625, "grad_norm_var": 0.0035481770833333333, "learning_rate": 0.0001, "loss": 1.515, "loss/crossentropy": 2.3917824029922485, "loss/fcd": 1.3125, "loss/idx": 10.0, "loss/logits": 0.20254884660243988, "step": 7966 }, { "epoch": 0.11896460328955719, "grad_norm": 0.205078125, "grad_norm_var": 0.004881652196248373, "learning_rate": 0.0001, "loss": 1.2106, "loss/crossentropy": 2.364420533180237, "loss/fcd": 1.06640625, "loss/idx": 10.5, "loss/logits": 0.14419487863779068, "step": 7967 }, { "epoch": 0.11897953546017216, "grad_norm": 0.2412109375, "grad_norm_var": 0.005175872643788656, "learning_rate": 0.0001, "loss": 1.2421, "loss/crossentropy": 2.746548891067505, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.1639568954706192, "step": 7968 }, { "epoch": 0.11899446763078715, "grad_norm": 0.232421875, "grad_norm_var": 0.005758885542551676, "learning_rate": 0.0001, "loss": 1.3039, "loss/crossentropy": 2.71278178691864, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.17886710911989212, "step": 7969 }, { "epoch": 0.11900939980140213, "grad_norm": 0.240234375, "grad_norm_var": 0.006246880690256754, "learning_rate": 0.0001, "loss": 1.4854, "loss/crossentropy": 2.320678472518921, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.23931176960468292, "step": 7970 }, { "epoch": 0.11902433197201712, "grad_norm": 0.2177734375, "grad_norm_var": 0.006945610046386719, "learning_rate": 0.0001, "loss": 1.3525, "loss/crossentropy": 2.310398578643799, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.1923314481973648, "step": 7971 }, { "epoch": 0.11903926414263209, "grad_norm": 0.287109375, "grad_norm_var": 0.006977653503417969, "learning_rate": 0.0001, "loss": 1.4675, "loss/crossentropy": 2.695884346961975, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.22137898206710815, "step": 7972 }, { "epoch": 0.11905419631324708, "grad_norm": 0.263671875, "grad_norm_var": 0.007063023249308268, "learning_rate": 0.0001, "loss": 1.3875, "loss/crossentropy": 2.608233690261841, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.1921418458223343, "step": 7973 }, { "epoch": 0.11906912848386206, "grad_norm": 0.2578125, "grad_norm_var": 0.007225910822550456, "learning_rate": 0.0001, "loss": 1.525, "loss/crossentropy": 2.417559027671814, "loss/fcd": 1.2890625, "loss/idx": 10.5, "loss/logits": 0.23591835796833038, "step": 7974 }, { "epoch": 0.11908406065447703, "grad_norm": 0.40234375, "grad_norm_var": 0.007575289408365885, "learning_rate": 0.0001, "loss": 2.1204, "loss/crossentropy": 2.665987730026245, "loss/fcd": 1.59375, "loss/idx": 10.5, "loss/logits": 0.5266189202666283, "step": 7975 }, { "epoch": 0.11909899282509202, "grad_norm": 0.21875, "grad_norm_var": 0.007917133967081706, "learning_rate": 0.0001, "loss": 1.4841, "loss/crossentropy": 2.4878379106521606, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.24188381433486938, "step": 7976 }, { "epoch": 0.119113924995707, "grad_norm": 0.2734375, "grad_norm_var": 0.007904688517252604, "learning_rate": 0.0001, "loss": 1.3436, "loss/crossentropy": 2.6157069206237793, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.17562073469161987, "step": 7977 }, { "epoch": 0.11912885716632199, "grad_norm": 0.3203125, "grad_norm_var": 0.0035460154215494794, "learning_rate": 0.0001, "loss": 1.7994, "loss/crossentropy": 2.534656047821045, "loss/fcd": 1.51171875, "loss/idx": 10.5, "loss/logits": 0.2877208888530731, "step": 7978 }, { "epoch": 0.11914378933693696, "grad_norm": 0.28515625, "grad_norm_var": 0.0027465661366780597, "learning_rate": 0.0001, "loss": 1.4285, "loss/crossentropy": 2.700538158416748, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.21760288625955582, "step": 7979 }, { "epoch": 0.11915872150755194, "grad_norm": 0.255859375, "grad_norm_var": 0.0026475270589192707, "learning_rate": 0.0001, "loss": 1.376, "loss/crossentropy": 2.711959481239319, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.21193033456802368, "step": 7980 }, { "epoch": 0.11917365367816693, "grad_norm": 0.349609375, "grad_norm_var": 0.0028372287750244142, "learning_rate": 0.0001, "loss": 1.6093, "loss/crossentropy": 2.9083940982818604, "loss/fcd": 1.34765625, "loss/idx": 10.5, "loss/logits": 0.26159629225730896, "step": 7981 }, { "epoch": 0.1191885858487819, "grad_norm": 0.24609375, "grad_norm_var": 0.0027027130126953125, "learning_rate": 0.0001, "loss": 1.3408, "loss/crossentropy": 2.6891578435897827, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.19626332074403763, "step": 7982 }, { "epoch": 0.1192035180193969, "grad_norm": 0.349609375, "grad_norm_var": 0.0027850468953450522, "learning_rate": 0.0001, "loss": 1.4378, "loss/crossentropy": 2.782275915145874, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.20342323184013367, "step": 7983 }, { "epoch": 0.11921845019001187, "grad_norm": 0.400390625, "grad_norm_var": 0.003596619764963786, "learning_rate": 0.0001, "loss": 1.6458, "loss/crossentropy": 2.4217931032180786, "loss/fcd": 1.4140625, "loss/idx": 10.5, "loss/logits": 0.23172461986541748, "step": 7984 }, { "epoch": 0.11923338236062685, "grad_norm": 0.26171875, "grad_norm_var": 0.0034349719683329266, "learning_rate": 0.0001, "loss": 1.3999, "loss/crossentropy": 2.5035535097122192, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.1928439885377884, "step": 7985 }, { "epoch": 0.11924831453124184, "grad_norm": 0.2236328125, "grad_norm_var": 0.003560956319173177, "learning_rate": 0.0001, "loss": 1.1903, "loss/crossentropy": 2.384756088256836, "loss/fcd": 1.04296875, "loss/idx": 10.5, "loss/logits": 0.1472921222448349, "step": 7986 }, { "epoch": 0.11926324670185681, "grad_norm": 0.255859375, "grad_norm_var": 0.003293319543202718, "learning_rate": 0.0001, "loss": 1.3111, "loss/crossentropy": 2.704224109649658, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.18610435724258423, "step": 7987 }, { "epoch": 0.1192781788724718, "grad_norm": 0.259765625, "grad_norm_var": 0.0033531785011291504, "learning_rate": 0.0001, "loss": 1.3931, "loss/crossentropy": 2.4009376764297485, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.1938885599374771, "step": 7988 }, { "epoch": 0.11929311104308678, "grad_norm": 0.251953125, "grad_norm_var": 0.003401339054107666, "learning_rate": 0.0001, "loss": 1.4776, "loss/crossentropy": 2.6200984716415405, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.21589010208845139, "step": 7989 }, { "epoch": 0.11930804321370175, "grad_norm": 0.23828125, "grad_norm_var": 0.003504494825998942, "learning_rate": 0.0001, "loss": 1.3404, "loss/crossentropy": 2.520450234413147, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.18804931640625, "step": 7990 }, { "epoch": 0.11932297538431674, "grad_norm": 0.302734375, "grad_norm_var": 0.002593354384104411, "learning_rate": 0.0001, "loss": 1.5724, "loss/crossentropy": 2.4857845306396484, "loss/fcd": 1.32421875, "loss/idx": 10.5, "loss/logits": 0.24815401434898376, "step": 7991 }, { "epoch": 0.11933790755493172, "grad_norm": 0.220703125, "grad_norm_var": 0.0025774280230204264, "learning_rate": 0.0001, "loss": 1.3371, "loss/crossentropy": 2.5169718265533447, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.19254283607006073, "step": 7992 }, { "epoch": 0.11935283972554671, "grad_norm": 0.2197265625, "grad_norm_var": 0.0028114954630533854, "learning_rate": 0.0001, "loss": 1.3259, "loss/crossentropy": 2.6391266584396362, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.19313332438468933, "step": 7993 }, { "epoch": 0.11936777189616168, "grad_norm": 0.21875, "grad_norm_var": 0.002877616882324219, "learning_rate": 0.0001, "loss": 1.2683, "loss/crossentropy": 2.3971309661865234, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.1706625521183014, "step": 7994 }, { "epoch": 0.11938270406677667, "grad_norm": 0.23046875, "grad_norm_var": 0.0029630661010742188, "learning_rate": 0.0001, "loss": 1.2903, "loss/crossentropy": 2.4497820138931274, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.17315435409545898, "step": 7995 }, { "epoch": 0.11939763623739165, "grad_norm": 0.224609375, "grad_norm_var": 0.0030739466349283856, "learning_rate": 0.0001, "loss": 1.364, "loss/crossentropy": 2.5404549837112427, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.20385730266571045, "step": 7996 }, { "epoch": 0.11941256840800663, "grad_norm": 0.296875, "grad_norm_var": 0.002658955256144206, "learning_rate": 0.0001, "loss": 1.377, "loss/crossentropy": 2.7508617639541626, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.205083966255188, "step": 7997 }, { "epoch": 0.11942750057862161, "grad_norm": 0.265625, "grad_norm_var": 0.002639881769816081, "learning_rate": 0.0001, "loss": 1.4906, "loss/crossentropy": 2.7640637159347534, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.2250165194272995, "step": 7998 }, { "epoch": 0.11944243274923659, "grad_norm": 0.287109375, "grad_norm_var": 0.002168893814086914, "learning_rate": 0.0001, "loss": 1.4424, "loss/crossentropy": 2.877802610397339, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.21588633954524994, "step": 7999 }, { "epoch": 0.11945736491985158, "grad_norm": 0.298828125, "grad_norm_var": 0.0009109338124593098, "learning_rate": 0.0001, "loss": 1.5089, "loss/crossentropy": 2.6522082090377808, "loss/fcd": 1.2890625, "loss/idx": 10.5, "loss/logits": 0.21980702877044678, "step": 8000 }, { "epoch": 0.11947229709046656, "grad_norm": 0.2421875, "grad_norm_var": 0.0009134769439697266, "learning_rate": 0.0001, "loss": 1.3913, "loss/crossentropy": 2.4356123208999634, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.199943408370018, "step": 8001 }, { "epoch": 0.11948722926108153, "grad_norm": 0.2431640625, "grad_norm_var": 0.0008626143137613932, "learning_rate": 0.0001, "loss": 1.2149, "loss/crossentropy": 2.4849637746810913, "loss/fcd": 1.05859375, "loss/idx": 10.5, "loss/logits": 0.15635249018669128, "step": 8002 }, { "epoch": 0.11950216143169652, "grad_norm": 0.2373046875, "grad_norm_var": 0.0008783936500549316, "learning_rate": 0.0001, "loss": 1.3966, "loss/crossentropy": 2.704202890396118, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.2013060599565506, "step": 8003 }, { "epoch": 0.1195170936023115, "grad_norm": 0.21875, "grad_norm_var": 0.000943148136138916, "learning_rate": 0.0001, "loss": 1.2199, "loss/crossentropy": 2.5321218967437744, "loss/fcd": 1.064453125, "loss/idx": 10.5, "loss/logits": 0.15547509491443634, "step": 8004 }, { "epoch": 0.11953202577292649, "grad_norm": 0.30078125, "grad_norm_var": 0.0011060674985249837, "learning_rate": 0.0001, "loss": 1.4447, "loss/crossentropy": 2.459963321685791, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.19473715126514435, "step": 8005 }, { "epoch": 0.11954695794354146, "grad_norm": 0.30078125, "grad_norm_var": 0.001228646437327067, "learning_rate": 0.0001, "loss": 1.471, "loss/crossentropy": 2.486038088798523, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.22099748253822327, "step": 8006 }, { "epoch": 0.11956189011415644, "grad_norm": 0.28515625, "grad_norm_var": 0.0011402408281962076, "learning_rate": 0.0001, "loss": 1.4982, "loss/crossentropy": 2.389488101005554, "loss/fcd": 1.29296875, "loss/idx": 10.5, "loss/logits": 0.2052294909954071, "step": 8007 }, { "epoch": 0.11957682228477143, "grad_norm": 0.25, "grad_norm_var": 0.0010572711626688639, "learning_rate": 0.0001, "loss": 1.4301, "loss/crossentropy": 2.868163228034973, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.22304052114486694, "step": 8008 }, { "epoch": 0.1195917544553864, "grad_norm": 0.32421875, "grad_norm_var": 0.0012133121490478516, "learning_rate": 0.0001, "loss": 1.6082, "loss/crossentropy": 2.7543224096298218, "loss/fcd": 1.359375, "loss/idx": 10.5, "loss/logits": 0.24885506927967072, "step": 8009 }, { "epoch": 0.1196066866260014, "grad_norm": 0.2353515625, "grad_norm_var": 0.0011302908261617026, "learning_rate": 0.0001, "loss": 1.4984, "loss/crossentropy": 2.3777823448181152, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.24063487350940704, "step": 8010 }, { "epoch": 0.11962161879661637, "grad_norm": 0.486328125, "grad_norm_var": 0.004041190942128499, "learning_rate": 0.0001, "loss": 1.494, "loss/crossentropy": 2.7144612073898315, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.2361600622534752, "step": 8011 }, { "epoch": 0.11963655096723134, "grad_norm": 0.2578125, "grad_norm_var": 0.003860151767730713, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.6549034118652344, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.20032457262277603, "step": 8012 }, { "epoch": 0.11965148313784633, "grad_norm": 0.267578125, "grad_norm_var": 0.003860151767730713, "learning_rate": 0.0001, "loss": 1.3612, "loss/crossentropy": 2.782062530517578, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.19709522277116776, "step": 8013 }, { "epoch": 0.11966641530846131, "grad_norm": 0.2265625, "grad_norm_var": 0.004037217299143473, "learning_rate": 0.0001, "loss": 1.4396, "loss/crossentropy": 2.4494928121566772, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.22474589198827744, "step": 8014 }, { "epoch": 0.1196813474790763, "grad_norm": 0.22265625, "grad_norm_var": 0.004226044813791911, "learning_rate": 0.0001, "loss": 1.3941, "loss/crossentropy": 2.5341250896453857, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.21055330336093903, "step": 8015 }, { "epoch": 0.11969627964969128, "grad_norm": 0.22265625, "grad_norm_var": 0.004345063368479411, "learning_rate": 0.0001, "loss": 1.3268, "loss/crossentropy": 2.596452474594116, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.1822447031736374, "step": 8016 }, { "epoch": 0.11971121182030627, "grad_norm": 0.271484375, "grad_norm_var": 0.004289750258127848, "learning_rate": 0.0001, "loss": 1.3648, "loss/crossentropy": 2.4679194688796997, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.1812444031238556, "step": 8017 }, { "epoch": 0.11972614399092124, "grad_norm": 0.255859375, "grad_norm_var": 0.004251162211100261, "learning_rate": 0.0001, "loss": 1.3841, "loss/crossentropy": 2.4313234090805054, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.20438385754823685, "step": 8018 }, { "epoch": 0.11974107616153622, "grad_norm": 0.25390625, "grad_norm_var": 0.004190027713775635, "learning_rate": 0.0001, "loss": 1.4644, "loss/crossentropy": 2.691161870956421, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.23389152437448502, "step": 8019 }, { "epoch": 0.1197560083321512, "grad_norm": 0.232421875, "grad_norm_var": 0.004101463158925374, "learning_rate": 0.0001, "loss": 1.309, "loss/crossentropy": 2.593339204788208, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.1840251162648201, "step": 8020 }, { "epoch": 0.11977094050276618, "grad_norm": 0.2255859375, "grad_norm_var": 0.004192336400349935, "learning_rate": 0.0001, "loss": 1.2688, "loss/crossentropy": 2.6285635232925415, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.1750718057155609, "step": 8021 }, { "epoch": 0.11978587267338117, "grad_norm": 0.23046875, "grad_norm_var": 0.004211791356404622, "learning_rate": 0.0001, "loss": 1.2641, "loss/crossentropy": 2.617177724838257, "loss/fcd": 1.095703125, "loss/idx": 10.5, "loss/logits": 0.1683509722352028, "step": 8022 }, { "epoch": 0.11980080484399615, "grad_norm": 0.298828125, "grad_norm_var": 0.004259300231933594, "learning_rate": 0.0001, "loss": 1.5706, "loss/crossentropy": 2.385474681854248, "loss/fcd": 1.34375, "loss/idx": 10.5, "loss/logits": 0.22683145105838776, "step": 8023 }, { "epoch": 0.11981573701461112, "grad_norm": 0.23046875, "grad_norm_var": 0.004325739542643229, "learning_rate": 0.0001, "loss": 1.286, "loss/crossentropy": 2.6772756576538086, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.17270059883594513, "step": 8024 }, { "epoch": 0.11983066918522611, "grad_norm": 0.2158203125, "grad_norm_var": 0.00420620838801066, "learning_rate": 0.0001, "loss": 1.2119, "loss/crossentropy": 2.607297897338867, "loss/fcd": 1.05078125, "loss/idx": 10.5, "loss/logits": 0.16107936203479767, "step": 8025 }, { "epoch": 0.11984560135584109, "grad_norm": 0.234375, "grad_norm_var": 0.004209264119466146, "learning_rate": 0.0001, "loss": 1.3902, "loss/crossentropy": 2.8064101934432983, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.19875632971525192, "step": 8026 }, { "epoch": 0.11986053352645608, "grad_norm": 0.23828125, "grad_norm_var": 0.0005131880442301432, "learning_rate": 0.0001, "loss": 1.5066, "loss/crossentropy": 2.6249821186065674, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.2604566514492035, "step": 8027 }, { "epoch": 0.11987546569707105, "grad_norm": 0.244140625, "grad_norm_var": 0.0004975001017252604, "learning_rate": 0.0001, "loss": 1.334, "loss/crossentropy": 2.412501573562622, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.18554408103227615, "step": 8028 }, { "epoch": 0.11989039786768603, "grad_norm": 0.21875, "grad_norm_var": 0.00047961870829264325, "learning_rate": 0.0001, "loss": 1.3528, "loss/crossentropy": 2.81849205493927, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.20050249993801117, "step": 8029 }, { "epoch": 0.11990533003830102, "grad_norm": 0.2333984375, "grad_norm_var": 0.0004713018735249837, "learning_rate": 0.0001, "loss": 1.3295, "loss/crossentropy": 2.641414999961853, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.2005954533815384, "step": 8030 }, { "epoch": 0.119920262208916, "grad_norm": 0.236328125, "grad_norm_var": 0.00045260985692342125, "learning_rate": 0.0001, "loss": 1.3814, "loss/crossentropy": 2.4730303287506104, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.19785376638174057, "step": 8031 }, { "epoch": 0.11993519437953098, "grad_norm": 0.234375, "grad_norm_var": 0.0004338224728902181, "learning_rate": 0.0001, "loss": 1.3785, "loss/crossentropy": 2.484945774078369, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.21054290980100632, "step": 8032 }, { "epoch": 0.11995012655014596, "grad_norm": 0.2314453125, "grad_norm_var": 0.0003707726796468099, "learning_rate": 0.0001, "loss": 1.3412, "loss/crossentropy": 2.575652837753296, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.1809985712170601, "step": 8033 }, { "epoch": 0.11996505872076095, "grad_norm": 0.2490234375, "grad_norm_var": 0.0003577828407287598, "learning_rate": 0.0001, "loss": 1.4237, "loss/crossentropy": 2.5667550563812256, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.1971781626343727, "step": 8034 }, { "epoch": 0.11997999089137593, "grad_norm": 0.224609375, "grad_norm_var": 0.00034919977188110354, "learning_rate": 0.0001, "loss": 1.217, "loss/crossentropy": 2.7379181385040283, "loss/fcd": 1.0546875, "loss/idx": 10.5, "loss/logits": 0.1623176783323288, "step": 8035 }, { "epoch": 0.1199949230619909, "grad_norm": 0.228515625, "grad_norm_var": 0.00035209258397420247, "learning_rate": 0.0001, "loss": 1.2861, "loss/crossentropy": 2.696255326271057, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.18452265113592148, "step": 8036 }, { "epoch": 0.12000985523260589, "grad_norm": 0.21484375, "grad_norm_var": 0.00037407875061035156, "learning_rate": 0.0001, "loss": 1.2733, "loss/crossentropy": 2.6137301921844482, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.1795518845319748, "step": 8037 }, { "epoch": 0.12002478740322087, "grad_norm": 0.244140625, "grad_norm_var": 0.00037708282470703123, "learning_rate": 0.0001, "loss": 1.5095, "loss/crossentropy": 2.450811505317688, "loss/fcd": 1.2890625, "loss/idx": 10.5, "loss/logits": 0.22040042281150818, "step": 8038 }, { "epoch": 0.12003971957383586, "grad_norm": 0.259765625, "grad_norm_var": 0.00014565785725911457, "learning_rate": 0.0001, "loss": 1.4128, "loss/crossentropy": 2.35595703125, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.20580196380615234, "step": 8039 }, { "epoch": 0.12005465174445083, "grad_norm": 0.2353515625, "grad_norm_var": 0.0001450816790262858, "learning_rate": 0.0001, "loss": 1.2795, "loss/crossentropy": 2.5503329038619995, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.17400258779525757, "step": 8040 }, { "epoch": 0.12006958391506581, "grad_norm": 0.2333984375, "grad_norm_var": 0.00012190739313761393, "learning_rate": 0.0001, "loss": 1.2976, "loss/crossentropy": 2.795785427093506, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18822511285543442, "step": 8041 }, { "epoch": 0.1200845160856808, "grad_norm": 0.296875, "grad_norm_var": 0.00036045312881469724, "learning_rate": 0.0001, "loss": 1.5805, "loss/crossentropy": 2.75303053855896, "loss/fcd": 1.3125, "loss/idx": 10.5, "loss/logits": 0.26800140738487244, "step": 8042 }, { "epoch": 0.12009944825629577, "grad_norm": 0.259765625, "grad_norm_var": 0.0003873785336812337, "learning_rate": 0.0001, "loss": 1.4127, "loss/crossentropy": 2.806278347969055, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.21737445145845413, "step": 8043 }, { "epoch": 0.12011438042691076, "grad_norm": 0.255859375, "grad_norm_var": 0.00040196975072224935, "learning_rate": 0.0001, "loss": 1.4449, "loss/crossentropy": 2.655312657356262, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.22610507905483246, "step": 8044 }, { "epoch": 0.12012931259752574, "grad_norm": 0.29296875, "grad_norm_var": 0.0005257884661356608, "learning_rate": 0.0001, "loss": 1.6349, "loss/crossentropy": 2.440369725227356, "loss/fcd": 1.3828125, "loss/idx": 10.5, "loss/logits": 0.2520742118358612, "step": 8045 }, { "epoch": 0.12014424476814071, "grad_norm": 0.275390625, "grad_norm_var": 0.0005673090616861979, "learning_rate": 0.0001, "loss": 1.3437, "loss/crossentropy": 3.162567973136902, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.17570175230503082, "step": 8046 }, { "epoch": 0.1201591769387557, "grad_norm": 0.28125, "grad_norm_var": 0.0006217797597249349, "learning_rate": 0.0001, "loss": 1.4892, "loss/crossentropy": 2.472326159477234, "loss/fcd": 1.25390625, "loss/idx": 10.5, "loss/logits": 0.23524374514818192, "step": 8047 }, { "epoch": 0.12017410910937068, "grad_norm": 0.2890625, "grad_norm_var": 0.0006867567698160807, "learning_rate": 0.0001, "loss": 1.4547, "loss/crossentropy": 2.5684540271759033, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.21645181626081467, "step": 8048 }, { "epoch": 0.12018904127998567, "grad_norm": 0.248046875, "grad_norm_var": 0.000652913252512614, "learning_rate": 0.0001, "loss": 1.4052, "loss/crossentropy": 2.643010377883911, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.20600666850805283, "step": 8049 }, { "epoch": 0.12020397345060065, "grad_norm": 0.255859375, "grad_norm_var": 0.0006498813629150391, "learning_rate": 0.0001, "loss": 1.5917, "loss/crossentropy": 2.629629373550415, "loss/fcd": 1.33984375, "loss/idx": 10.5, "loss/logits": 0.25187452137470245, "step": 8050 }, { "epoch": 0.12021890562121562, "grad_norm": 0.33984375, "grad_norm_var": 0.0009977976481119792, "learning_rate": 0.0001, "loss": 1.3831, "loss/crossentropy": 2.475147843360901, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.19949006289243698, "step": 8051 }, { "epoch": 0.12023383779183061, "grad_norm": 0.20703125, "grad_norm_var": 0.0011259555816650391, "learning_rate": 0.0001, "loss": 1.288, "loss/crossentropy": 2.340724229812622, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.1669100522994995, "step": 8052 }, { "epoch": 0.12024876996244559, "grad_norm": 0.23828125, "grad_norm_var": 0.0010134220123291016, "learning_rate": 0.0001, "loss": 1.3114, "loss/crossentropy": 2.8299330472946167, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.18246394395828247, "step": 8053 }, { "epoch": 0.12026370213306058, "grad_norm": 0.2080078125, "grad_norm_var": 0.0011873523394266765, "learning_rate": 0.0001, "loss": 1.3193, "loss/crossentropy": 2.5085986852645874, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.17864708602428436, "step": 8054 }, { "epoch": 0.12027863430367555, "grad_norm": 0.2333984375, "grad_norm_var": 0.0012353102366129557, "learning_rate": 0.0001, "loss": 1.2617, "loss/crossentropy": 2.5679949522018433, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.17968431115150452, "step": 8055 }, { "epoch": 0.12029356647429054, "grad_norm": 0.2373046875, "grad_norm_var": 0.0012292861938476562, "learning_rate": 0.0001, "loss": 1.3396, "loss/crossentropy": 2.567921996116638, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.1794596165418625, "step": 8056 }, { "epoch": 0.12030849864490552, "grad_norm": 0.296875, "grad_norm_var": 0.0012600223223368328, "learning_rate": 0.0001, "loss": 1.5007, "loss/crossentropy": 3.0239306688308716, "loss/fcd": 1.27734375, "loss/idx": 10.5, "loss/logits": 0.22337963432073593, "step": 8057 }, { "epoch": 0.1203234308155205, "grad_norm": 0.26953125, "grad_norm_var": 0.0011850317319234212, "learning_rate": 0.0001, "loss": 1.5456, "loss/crossentropy": 2.597132086753845, "loss/fcd": 1.31640625, "loss/idx": 10.5, "loss/logits": 0.2291560396552086, "step": 8058 }, { "epoch": 0.12033836298613548, "grad_norm": 0.224609375, "grad_norm_var": 0.0012717207272847494, "learning_rate": 0.0001, "loss": 1.2429, "loss/crossentropy": 2.718841791152954, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.1647748500108719, "step": 8059 }, { "epoch": 0.12035329515675046, "grad_norm": 0.251953125, "grad_norm_var": 0.0012746135393778483, "learning_rate": 0.0001, "loss": 1.5583, "loss/crossentropy": 2.3759069442749023, "loss/fcd": 1.31640625, "loss/idx": 10.5, "loss/logits": 0.24186883121728897, "step": 8060 }, { "epoch": 0.12036822732736545, "grad_norm": 0.2177734375, "grad_norm_var": 0.0012908299763997396, "learning_rate": 0.0001, "loss": 1.2616, "loss/crossentropy": 2.600662112236023, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.16787777096033096, "step": 8061 }, { "epoch": 0.12038315949798042, "grad_norm": 0.287109375, "grad_norm_var": 0.0013318379720052083, "learning_rate": 0.0001, "loss": 1.3191, "loss/crossentropy": 2.73337185382843, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.1823841854929924, "step": 8062 }, { "epoch": 0.1203980916685954, "grad_norm": 0.2275390625, "grad_norm_var": 0.0013268113136291504, "learning_rate": 0.0001, "loss": 1.2564, "loss/crossentropy": 2.6318541765213013, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.17824064195156097, "step": 8063 }, { "epoch": 0.12041302383921039, "grad_norm": 0.2314453125, "grad_norm_var": 0.00124967892964681, "learning_rate": 0.0001, "loss": 1.3563, "loss/crossentropy": 2.521461606025696, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.1922493278980255, "step": 8064 }, { "epoch": 0.12042795600982537, "grad_norm": 0.23046875, "grad_norm_var": 0.001269849141438802, "learning_rate": 0.0001, "loss": 1.2806, "loss/crossentropy": 2.5979830026626587, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.17509005963802338, "step": 8065 }, { "epoch": 0.12044288818044036, "grad_norm": 0.24609375, "grad_norm_var": 0.0012646834055582681, "learning_rate": 0.0001, "loss": 1.4082, "loss/crossentropy": 2.5036275386810303, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.18948617577552795, "step": 8066 }, { "epoch": 0.12045782035105533, "grad_norm": 0.240234375, "grad_norm_var": 0.0006477991739908854, "learning_rate": 0.0001, "loss": 1.3852, "loss/crossentropy": 2.579319715499878, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.1976538449525833, "step": 8067 }, { "epoch": 0.1204727525216703, "grad_norm": 0.2373046875, "grad_norm_var": 0.0005700707435607911, "learning_rate": 0.0001, "loss": 1.2085, "loss/crossentropy": 2.6395105123519897, "loss/fcd": 1.046875, "loss/idx": 10.5, "loss/logits": 0.1616077646613121, "step": 8068 }, { "epoch": 0.1204876846922853, "grad_norm": 0.25390625, "grad_norm_var": 0.0005768100420633952, "learning_rate": 0.0001, "loss": 1.3824, "loss/crossentropy": 2.769799590110779, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20658520609140396, "step": 8069 }, { "epoch": 0.12050261686290027, "grad_norm": 0.2451171875, "grad_norm_var": 0.0004880229632059733, "learning_rate": 0.0001, "loss": 1.4201, "loss/crossentropy": 2.6282840967178345, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.21700721979141235, "step": 8070 }, { "epoch": 0.12051754903351526, "grad_norm": 0.2294921875, "grad_norm_var": 0.0004953662554423014, "learning_rate": 0.0001, "loss": 1.3539, "loss/crossentropy": 2.822258710861206, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.20934458822011948, "step": 8071 }, { "epoch": 0.12053248120413024, "grad_norm": 0.2216796875, "grad_norm_var": 0.0005275368690490723, "learning_rate": 0.0001, "loss": 1.251, "loss/crossentropy": 2.3610697984695435, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.16114875674247742, "step": 8072 }, { "epoch": 0.12054741337474521, "grad_norm": 0.212890625, "grad_norm_var": 0.00038127501805623374, "learning_rate": 0.0001, "loss": 1.2516, "loss/crossentropy": 2.5709999799728394, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.17351547628641129, "step": 8073 }, { "epoch": 0.1205623455453602, "grad_norm": 0.224609375, "grad_norm_var": 0.00032570759455362953, "learning_rate": 0.0001, "loss": 1.3762, "loss/crossentropy": 2.7699161767959595, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.19651755690574646, "step": 8074 }, { "epoch": 0.12057727771597518, "grad_norm": 0.251953125, "grad_norm_var": 0.00032949050267537435, "learning_rate": 0.0001, "loss": 1.4022, "loss/crossentropy": 2.3708438873291016, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.19127117097377777, "step": 8075 }, { "epoch": 0.12059220988659017, "grad_norm": 0.2275390625, "grad_norm_var": 0.0003216425577799479, "learning_rate": 0.0001, "loss": 1.3006, "loss/crossentropy": 2.8539801836013794, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.17557822167873383, "step": 8076 }, { "epoch": 0.12060714205720514, "grad_norm": 0.470703125, "grad_norm_var": 0.0036860108375549316, "learning_rate": 0.0001, "loss": 1.5728, "loss/crossentropy": 2.655381679534912, "loss/fcd": 1.33984375, "loss/idx": 10.5, "loss/logits": 0.23294387757778168, "step": 8077 }, { "epoch": 0.12062207422782013, "grad_norm": 0.3125, "grad_norm_var": 0.003843875726064046, "learning_rate": 0.0001, "loss": 1.5181, "loss/crossentropy": 2.549080967903137, "loss/fcd": 1.28515625, "loss/idx": 10.5, "loss/logits": 0.23294444382190704, "step": 8078 }, { "epoch": 0.12063700639843511, "grad_norm": 0.283203125, "grad_norm_var": 0.0038413842519124347, "learning_rate": 0.0001, "loss": 1.6576, "loss/crossentropy": 2.2893768548965454, "loss/fcd": 1.38671875, "loss/idx": 10.5, "loss/logits": 0.2708342745900154, "step": 8079 }, { "epoch": 0.12065193856905009, "grad_norm": 0.30078125, "grad_norm_var": 0.0039014776547749838, "learning_rate": 0.0001, "loss": 1.5238, "loss/crossentropy": 2.4630309343338013, "loss/fcd": 1.3125, "loss/idx": 10.5, "loss/logits": 0.21126730740070343, "step": 8080 }, { "epoch": 0.12066687073966507, "grad_norm": 0.255859375, "grad_norm_var": 0.003835769494374593, "learning_rate": 0.0001, "loss": 1.3908, "loss/crossentropy": 2.6716036796569824, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.2072543352842331, "step": 8081 }, { "epoch": 0.12068180291028005, "grad_norm": 0.291015625, "grad_norm_var": 0.003858435153961182, "learning_rate": 0.0001, "loss": 1.5509, "loss/crossentropy": 2.3214367628097534, "loss/fcd": 1.32421875, "loss/idx": 10.5, "loss/logits": 0.22668011486530304, "step": 8082 }, { "epoch": 0.12069673508089504, "grad_norm": 0.31640625, "grad_norm_var": 0.003957617282867432, "learning_rate": 0.0001, "loss": 1.5336, "loss/crossentropy": 2.5384833812713623, "loss/fcd": 1.30859375, "loss/idx": 10.5, "loss/logits": 0.22497430443763733, "step": 8083 }, { "epoch": 0.12071166725151002, "grad_norm": 0.291015625, "grad_norm_var": 0.0038970788319905598, "learning_rate": 0.0001, "loss": 1.4524, "loss/crossentropy": 2.70397686958313, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.21410076320171356, "step": 8084 }, { "epoch": 0.12072659942212499, "grad_norm": 0.220703125, "grad_norm_var": 0.004056231180826823, "learning_rate": 0.0001, "loss": 1.3807, "loss/crossentropy": 2.376562714576721, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.2010304033756256, "step": 8085 }, { "epoch": 0.12074153159273998, "grad_norm": 0.232421875, "grad_norm_var": 0.004112176100413005, "learning_rate": 0.0001, "loss": 1.351, "loss/crossentropy": 2.4664095640182495, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.186956524848938, "step": 8086 }, { "epoch": 0.12075646376335496, "grad_norm": 0.27734375, "grad_norm_var": 0.003987757364908854, "learning_rate": 0.0001, "loss": 1.3657, "loss/crossentropy": 2.4514739513397217, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.1782105714082718, "step": 8087 }, { "epoch": 0.12077139593396995, "grad_norm": 0.25, "grad_norm_var": 0.0038387576738993325, "learning_rate": 0.0001, "loss": 1.3617, "loss/crossentropy": 2.5653003454208374, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.19377927482128143, "step": 8088 }, { "epoch": 0.12078632810458492, "grad_norm": 0.23046875, "grad_norm_var": 0.003709725538889567, "learning_rate": 0.0001, "loss": 1.4454, "loss/crossentropy": 2.4942978620529175, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.22276995331048965, "step": 8089 }, { "epoch": 0.1208012602751999, "grad_norm": 0.279296875, "grad_norm_var": 0.003512569268544515, "learning_rate": 0.0001, "loss": 1.5913, "loss/crossentropy": 2.6532763242721558, "loss/fcd": 1.3046875, "loss/idx": 10.5, "loss/logits": 0.28664954006671906, "step": 8090 }, { "epoch": 0.12081619244581489, "grad_norm": 0.26953125, "grad_norm_var": 0.0034645040829976398, "learning_rate": 0.0001, "loss": 1.3937, "loss/crossentropy": 2.7718230485916138, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.19837529212236404, "step": 8091 }, { "epoch": 0.12083112461642986, "grad_norm": 0.28515625, "grad_norm_var": 0.003255144755045573, "learning_rate": 0.0001, "loss": 1.5022, "loss/crossentropy": 2.4977521896362305, "loss/fcd": 1.28125, "loss/idx": 10.5, "loss/logits": 0.22097352147102356, "step": 8092 }, { "epoch": 0.12084605678704485, "grad_norm": 0.298828125, "grad_norm_var": 0.0008549372355143229, "learning_rate": 0.0001, "loss": 1.5736, "loss/crossentropy": 2.8333715200424194, "loss/fcd": 1.3046875, "loss/idx": 10.5, "loss/logits": 0.2689314931631088, "step": 8093 }, { "epoch": 0.12086098895765983, "grad_norm": 0.2412109375, "grad_norm_var": 0.0008128762245178222, "learning_rate": 0.0001, "loss": 1.3467, "loss/crossentropy": 2.586162567138672, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.17091191560029984, "step": 8094 }, { "epoch": 0.12087592112827482, "grad_norm": 0.267578125, "grad_norm_var": 0.0008010506629943847, "learning_rate": 0.0001, "loss": 1.2031, "loss/crossentropy": 2.6566966772079468, "loss/fcd": 1.046875, "loss/idx": 10.5, "loss/logits": 0.15617681294679642, "step": 8095 }, { "epoch": 0.1208908532988898, "grad_norm": 0.2578125, "grad_norm_var": 0.0007356603940327963, "learning_rate": 0.0001, "loss": 1.2973, "loss/crossentropy": 2.6814682483673096, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.17228615283966064, "step": 8096 }, { "epoch": 0.12090578546950477, "grad_norm": 0.2470703125, "grad_norm_var": 0.0007530053456624349, "learning_rate": 0.0001, "loss": 1.432, "loss/crossentropy": 2.444265842437744, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.20155011862516403, "step": 8097 }, { "epoch": 0.12092071764011976, "grad_norm": 0.255859375, "grad_norm_var": 0.0007129510243733724, "learning_rate": 0.0001, "loss": 1.352, "loss/crossentropy": 2.6485191583633423, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.19968222081661224, "step": 8098 }, { "epoch": 0.12093564981073474, "grad_norm": 0.26171875, "grad_norm_var": 0.0005162398020426433, "learning_rate": 0.0001, "loss": 1.4437, "loss/crossentropy": 2.508718967437744, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.20934712141752243, "step": 8099 }, { "epoch": 0.12095058198134973, "grad_norm": 0.2216796875, "grad_norm_var": 0.000533449649810791, "learning_rate": 0.0001, "loss": 1.2819, "loss/crossentropy": 2.6092841625213623, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.18813813477754593, "step": 8100 }, { "epoch": 0.1209655141519647, "grad_norm": 0.2197265625, "grad_norm_var": 0.0005381107330322266, "learning_rate": 0.0001, "loss": 1.3845, "loss/crossentropy": 2.4246556758880615, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.2086939960718155, "step": 8101 }, { "epoch": 0.12098044632257968, "grad_norm": 0.57421875, "grad_norm_var": 0.006766001383463542, "learning_rate": 0.0001, "loss": 1.7843, "loss/crossentropy": 2.5798369646072388, "loss/fcd": 1.421875, "loss/idx": 10.5, "loss/logits": 0.3624489977955818, "step": 8102 }, { "epoch": 0.12099537849319467, "grad_norm": 0.318359375, "grad_norm_var": 0.006871143976847331, "learning_rate": 0.0001, "loss": 1.5721, "loss/crossentropy": 2.6985236406326294, "loss/fcd": 1.33203125, "loss/idx": 10.5, "loss/logits": 0.2400946170091629, "step": 8103 }, { "epoch": 0.12101031066380964, "grad_norm": 0.267578125, "grad_norm_var": 0.006820360819498698, "learning_rate": 0.0001, "loss": 1.5791, "loss/crossentropy": 2.449047327041626, "loss/fcd": 1.30859375, "loss/idx": 10.5, "loss/logits": 0.27051082253456116, "step": 8104 }, { "epoch": 0.12102524283442463, "grad_norm": 0.25390625, "grad_norm_var": 0.006696764628092448, "learning_rate": 0.0001, "loss": 1.2938, "loss/crossentropy": 2.6027495861053467, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.1844446286559105, "step": 8105 }, { "epoch": 0.12104017500503961, "grad_norm": 0.259765625, "grad_norm_var": 0.0067288716634114586, "learning_rate": 0.0001, "loss": 1.4154, "loss/crossentropy": 2.337839365005493, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.20441724359989166, "step": 8106 }, { "epoch": 0.12105510717565458, "grad_norm": 0.265625, "grad_norm_var": 0.0067359288533528645, "learning_rate": 0.0001, "loss": 1.3861, "loss/crossentropy": 2.59462833404541, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.18684500455856323, "step": 8107 }, { "epoch": 0.12107003934626957, "grad_norm": 0.23828125, "grad_norm_var": 0.006847318013509115, "learning_rate": 0.0001, "loss": 1.2363, "loss/crossentropy": 2.8639023303985596, "loss/fcd": 1.06640625, "loss/idx": 10.5, "loss/logits": 0.16986434906721115, "step": 8108 }, { "epoch": 0.12108497151688455, "grad_norm": 0.2216796875, "grad_norm_var": 0.007005846500396729, "learning_rate": 0.0001, "loss": 1.3736, "loss/crossentropy": 2.656415581703186, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.20561529695987701, "step": 8109 }, { "epoch": 0.12109990368749954, "grad_norm": 0.2353515625, "grad_norm_var": 0.007033026218414307, "learning_rate": 0.0001, "loss": 1.2765, "loss/crossentropy": 2.439162850379944, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.17887534946203232, "step": 8110 }, { "epoch": 0.12111483585811451, "grad_norm": 0.25, "grad_norm_var": 0.007064783573150634, "learning_rate": 0.0001, "loss": 1.3725, "loss/crossentropy": 2.517789602279663, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.19668737798929214, "step": 8111 }, { "epoch": 0.12112976802872949, "grad_norm": 0.28515625, "grad_norm_var": 0.007060555617014567, "learning_rate": 0.0001, "loss": 1.2286, "loss/crossentropy": 2.5597970485687256, "loss/fcd": 1.06640625, "loss/idx": 10.5, "loss/logits": 0.16220339387655258, "step": 8112 }, { "epoch": 0.12114470019934448, "grad_norm": 0.279296875, "grad_norm_var": 0.007011906305948893, "learning_rate": 0.0001, "loss": 1.5913, "loss/crossentropy": 2.3827359676361084, "loss/fcd": 1.3203125, "loss/idx": 10.5, "loss/logits": 0.27098412811756134, "step": 8113 }, { "epoch": 0.12115963236995946, "grad_norm": 0.2578125, "grad_norm_var": 0.007007026672363281, "learning_rate": 0.0001, "loss": 1.3296, "loss/crossentropy": 2.6568719148635864, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.18506301194429398, "step": 8114 }, { "epoch": 0.12117456454057444, "grad_norm": 0.26171875, "grad_norm_var": 0.007007026672363281, "learning_rate": 0.0001, "loss": 1.3841, "loss/crossentropy": 2.619659185409546, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20830000936985016, "step": 8115 }, { "epoch": 0.12118949671118942, "grad_norm": 0.25, "grad_norm_var": 0.006853417555491129, "learning_rate": 0.0001, "loss": 1.3815, "loss/crossentropy": 2.6393879652023315, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20568251609802246, "step": 8116 }, { "epoch": 0.12120442888180441, "grad_norm": 0.3984375, "grad_norm_var": 0.007475153605143229, "learning_rate": 0.0001, "loss": 1.4852, "loss/crossentropy": 2.586378335952759, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.22734390199184418, "step": 8117 }, { "epoch": 0.12121936105241939, "grad_norm": 0.255859375, "grad_norm_var": 0.0016846815745035807, "learning_rate": 0.0001, "loss": 1.261, "loss/crossentropy": 2.44896399974823, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.16334304958581924, "step": 8118 }, { "epoch": 0.12123429322303436, "grad_norm": 0.2578125, "grad_norm_var": 0.0015127182006835938, "learning_rate": 0.0001, "loss": 1.2618, "loss/crossentropy": 2.7137147188186646, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.17584489285945892, "step": 8119 }, { "epoch": 0.12124922539364935, "grad_norm": 0.275390625, "grad_norm_var": 0.0015193303426106772, "learning_rate": 0.0001, "loss": 1.5254, "loss/crossentropy": 2.451735734939575, "loss/fcd": 1.29296875, "loss/idx": 10.5, "loss/logits": 0.2324603945016861, "step": 8120 }, { "epoch": 0.12126415756426433, "grad_norm": 0.248046875, "grad_norm_var": 0.0015304406483968099, "learning_rate": 0.0001, "loss": 1.2759, "loss/crossentropy": 2.531006932258606, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.1743185669183731, "step": 8121 }, { "epoch": 0.12127908973487932, "grad_norm": 0.2138671875, "grad_norm_var": 0.0016942302385965983, "learning_rate": 0.0001, "loss": 1.3136, "loss/crossentropy": 2.5426151752471924, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.18858002871274948, "step": 8122 }, { "epoch": 0.12129402190549429, "grad_norm": 0.330078125, "grad_norm_var": 0.0019837657610575357, "learning_rate": 0.0001, "loss": 1.6765, "loss/crossentropy": 2.7376476526260376, "loss/fcd": 1.3671875, "loss/idx": 10.5, "loss/logits": 0.3093116730451584, "step": 8123 }, { "epoch": 0.12130895407610927, "grad_norm": 0.283203125, "grad_norm_var": 0.0019428213437398276, "learning_rate": 0.0001, "loss": 1.5916, "loss/crossentropy": 2.578715682029724, "loss/fcd": 1.3359375, "loss/idx": 10.5, "loss/logits": 0.2556767538189888, "step": 8124 }, { "epoch": 0.12132388624672426, "grad_norm": 0.220703125, "grad_norm_var": 0.0019490400950113933, "learning_rate": 0.0001, "loss": 1.2235, "loss/crossentropy": 2.7615519762039185, "loss/fcd": 1.05078125, "loss/idx": 10.5, "loss/logits": 0.17271021008491516, "step": 8125 }, { "epoch": 0.12133881841733923, "grad_norm": 0.296875, "grad_norm_var": 0.001910237471262614, "learning_rate": 0.0001, "loss": 1.4045, "loss/crossentropy": 2.4240235090255737, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.2053079903125763, "step": 8126 }, { "epoch": 0.12135375058795422, "grad_norm": 0.2578125, "grad_norm_var": 0.0018903374671936034, "learning_rate": 0.0001, "loss": 1.4333, "loss/crossentropy": 2.558429718017578, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.20672284811735153, "step": 8127 }, { "epoch": 0.1213686827585692, "grad_norm": 0.2451171875, "grad_norm_var": 0.0019269943237304687, "learning_rate": 0.0001, "loss": 1.4255, "loss/crossentropy": 2.4351738691329956, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.2185075506567955, "step": 8128 }, { "epoch": 0.12138361492918417, "grad_norm": 0.25390625, "grad_norm_var": 0.0019383589426676432, "learning_rate": 0.0001, "loss": 1.4292, "loss/crossentropy": 2.695996046066284, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.22603802382946014, "step": 8129 }, { "epoch": 0.12139854709979916, "grad_norm": 0.2197265625, "grad_norm_var": 0.00208666721979777, "learning_rate": 0.0001, "loss": 1.3234, "loss/crossentropy": 2.6595875024795532, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.18672076612710953, "step": 8130 }, { "epoch": 0.12141347927041414, "grad_norm": 0.35546875, "grad_norm_var": 0.002572659651438395, "learning_rate": 0.0001, "loss": 1.4383, "loss/crossentropy": 2.572994589805603, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.22739455103874207, "step": 8131 }, { "epoch": 0.12142841144102913, "grad_norm": 0.2236328125, "grad_norm_var": 0.0026957194010416665, "learning_rate": 0.0001, "loss": 1.2177, "loss/crossentropy": 2.8542068004608154, "loss/fcd": 1.05859375, "loss/idx": 10.5, "loss/logits": 0.15910407155752182, "step": 8132 }, { "epoch": 0.1214433436116441, "grad_norm": 0.224609375, "grad_norm_var": 0.0016305128733317058, "learning_rate": 0.0001, "loss": 1.3641, "loss/crossentropy": 2.7848715782165527, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.21179144829511642, "step": 8133 }, { "epoch": 0.12145827578225908, "grad_norm": 0.2119140625, "grad_norm_var": 0.0017762462298075358, "learning_rate": 0.0001, "loss": 1.2658, "loss/crossentropy": 2.383643627166748, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.17594093829393387, "step": 8134 }, { "epoch": 0.12147320795287407, "grad_norm": 0.259765625, "grad_norm_var": 0.001776595910390218, "learning_rate": 0.0001, "loss": 1.5316, "loss/crossentropy": 2.580428719520569, "loss/fcd": 1.28125, "loss/idx": 10.5, "loss/logits": 0.2503395080566406, "step": 8135 }, { "epoch": 0.12148814012348905, "grad_norm": 0.259765625, "grad_norm_var": 0.001754597822825114, "learning_rate": 0.0001, "loss": 1.4255, "loss/crossentropy": 2.6306440830230713, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.2106218859553337, "step": 8136 }, { "epoch": 0.12150307229410404, "grad_norm": 0.32421875, "grad_norm_var": 0.0020310680071512857, "learning_rate": 0.0001, "loss": 1.5485, "loss/crossentropy": 2.324381709098816, "loss/fcd": 1.33203125, "loss/idx": 10.5, "loss/logits": 0.21644727885723114, "step": 8137 }, { "epoch": 0.12151800446471901, "grad_norm": 0.271484375, "grad_norm_var": 0.001874224344889323, "learning_rate": 0.0001, "loss": 1.4497, "loss/crossentropy": 2.672226905822754, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.23485422134399414, "step": 8138 }, { "epoch": 0.121532936635334, "grad_norm": 0.29296875, "grad_norm_var": 0.0016377607981363932, "learning_rate": 0.0001, "loss": 1.4126, "loss/crossentropy": 2.6017720699310303, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.20161589235067368, "step": 8139 }, { "epoch": 0.12154786880594898, "grad_norm": 0.25390625, "grad_norm_var": 0.0016108194986979166, "learning_rate": 0.0001, "loss": 1.4176, "loss/crossentropy": 2.5726526975631714, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.20663633197546005, "step": 8140 }, { "epoch": 0.12156280097656395, "grad_norm": 0.22265625, "grad_norm_var": 0.00160063107808431, "learning_rate": 0.0001, "loss": 1.3305, "loss/crossentropy": 2.6024309396743774, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.1859448179602623, "step": 8141 }, { "epoch": 0.12157773314717894, "grad_norm": 0.275390625, "grad_norm_var": 0.001526323954264323, "learning_rate": 0.0001, "loss": 1.3375, "loss/crossentropy": 2.6565173864364624, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.17734551429748535, "step": 8142 }, { "epoch": 0.12159266531779392, "grad_norm": 0.2578125, "grad_norm_var": 0.001526323954264323, "learning_rate": 0.0001, "loss": 1.409, "loss/crossentropy": 2.744198799133301, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.19802267104387283, "step": 8143 }, { "epoch": 0.12160759748840891, "grad_norm": 0.21484375, "grad_norm_var": 0.0016417463620503744, "learning_rate": 0.0001, "loss": 1.2407, "loss/crossentropy": 2.676413893699646, "loss/fcd": 1.0625, "loss/idx": 10.5, "loss/logits": 0.17823028564453125, "step": 8144 }, { "epoch": 0.12162252965902388, "grad_norm": 0.2236328125, "grad_norm_var": 0.0017140547434488932, "learning_rate": 0.0001, "loss": 1.439, "loss/crossentropy": 2.207924246788025, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.20850975066423416, "step": 8145 }, { "epoch": 0.12163746182963886, "grad_norm": 0.2119140625, "grad_norm_var": 0.0017553806304931641, "learning_rate": 0.0001, "loss": 1.3903, "loss/crossentropy": 2.5716404914855957, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.20279821753501892, "step": 8146 }, { "epoch": 0.12165239400025385, "grad_norm": 0.369140625, "grad_norm_var": 0.001949755350748698, "learning_rate": 0.0001, "loss": 1.5176, "loss/crossentropy": 2.715051770210266, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.25196942687034607, "step": 8147 }, { "epoch": 0.12166732617086883, "grad_norm": 0.2119140625, "grad_norm_var": 0.0020090738932291665, "learning_rate": 0.0001, "loss": 1.2388, "loss/crossentropy": 2.685486674308777, "loss/fcd": 1.0703125, "loss/idx": 10.5, "loss/logits": 0.16846226900815964, "step": 8148 }, { "epoch": 0.12168225834148381, "grad_norm": 0.236328125, "grad_norm_var": 0.001969591776529948, "learning_rate": 0.0001, "loss": 1.3739, "loss/crossentropy": 2.6536601781845093, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.2137632817029953, "step": 8149 }, { "epoch": 0.12169719051209879, "grad_norm": 0.2294921875, "grad_norm_var": 0.0018853346506754558, "learning_rate": 0.0001, "loss": 1.3857, "loss/crossentropy": 2.7315213680267334, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20990876853466034, "step": 8150 }, { "epoch": 0.12171212268271377, "grad_norm": 0.255859375, "grad_norm_var": 0.0018849531809488932, "learning_rate": 0.0001, "loss": 1.3027, "loss/crossentropy": 2.6914652585983276, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.16600695997476578, "step": 8151 }, { "epoch": 0.12172705485332876, "grad_norm": 0.2177734375, "grad_norm_var": 0.0019794424374898274, "learning_rate": 0.0001, "loss": 1.3878, "loss/crossentropy": 2.4707149267196655, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.20425105839967728, "step": 8152 }, { "epoch": 0.12174198702394373, "grad_norm": 0.279296875, "grad_norm_var": 0.0016869823137919107, "learning_rate": 0.0001, "loss": 1.478, "loss/crossentropy": 2.5769460201263428, "loss/fcd": 1.25390625, "loss/idx": 10.5, "loss/logits": 0.22410277277231216, "step": 8153 }, { "epoch": 0.12175691919455872, "grad_norm": 0.265625, "grad_norm_var": 0.0016735355059305827, "learning_rate": 0.0001, "loss": 1.3224, "loss/crossentropy": 2.5278531312942505, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.17010293900966644, "step": 8154 }, { "epoch": 0.1217718513651737, "grad_norm": 0.2470703125, "grad_norm_var": 0.0015493392944335937, "learning_rate": 0.0001, "loss": 1.3635, "loss/crossentropy": 2.687135338783264, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19162975251674652, "step": 8155 }, { "epoch": 0.12178678353578867, "grad_norm": 0.3203125, "grad_norm_var": 0.0018746693929036458, "learning_rate": 0.0001, "loss": 1.5185, "loss/crossentropy": 2.7109696865081787, "loss/fcd": 1.2734375, "loss/idx": 10.5, "loss/logits": 0.2450617104768753, "step": 8156 }, { "epoch": 0.12180171570640366, "grad_norm": 0.318359375, "grad_norm_var": 0.0020670413970947264, "learning_rate": 0.0001, "loss": 1.7083, "loss/crossentropy": 2.4928895235061646, "loss/fcd": 1.44140625, "loss/idx": 10.5, "loss/logits": 0.2669026404619217, "step": 8157 }, { "epoch": 0.12181664787701864, "grad_norm": 0.251953125, "grad_norm_var": 0.002048349380493164, "learning_rate": 0.0001, "loss": 1.3624, "loss/crossentropy": 2.2825907468795776, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.1943955346941948, "step": 8158 }, { "epoch": 0.12183158004763363, "grad_norm": 0.3984375, "grad_norm_var": 0.003300333023071289, "learning_rate": 0.0001, "loss": 1.6486, "loss/crossentropy": 2.9835147857666016, "loss/fcd": 1.40234375, "loss/idx": 10.5, "loss/logits": 0.24620884656906128, "step": 8159 }, { "epoch": 0.1218465122182486, "grad_norm": 0.33984375, "grad_norm_var": 0.0034285068511962892, "learning_rate": 0.0001, "loss": 1.3701, "loss/crossentropy": 2.67192280292511, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19824697822332382, "step": 8160 }, { "epoch": 0.1218614443888636, "grad_norm": 0.232421875, "grad_norm_var": 0.003374826908111572, "learning_rate": 0.0001, "loss": 1.3998, "loss/crossentropy": 2.605825424194336, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.20836885273456573, "step": 8161 }, { "epoch": 0.12187637655947857, "grad_norm": 0.25390625, "grad_norm_var": 0.0031368096669514975, "learning_rate": 0.0001, "loss": 1.33, "loss/crossentropy": 2.7475627660751343, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.18152981996536255, "step": 8162 }, { "epoch": 0.12189130873009354, "grad_norm": 0.26953125, "grad_norm_var": 0.0025296529134114583, "learning_rate": 0.0001, "loss": 1.4581, "loss/crossentropy": 2.5844937562942505, "loss/fcd": 1.25390625, "loss/idx": 10.5, "loss/logits": 0.2042231336236, "step": 8163 }, { "epoch": 0.12190624090070853, "grad_norm": 0.30078125, "grad_norm_var": 0.002328964074452718, "learning_rate": 0.0001, "loss": 1.7125, "loss/crossentropy": 2.6428184509277344, "loss/fcd": 1.40625, "loss/idx": 10.5, "loss/logits": 0.3062002956867218, "step": 8164 }, { "epoch": 0.12192117307132351, "grad_norm": 0.2333984375, "grad_norm_var": 0.0023450215657552084, "learning_rate": 0.0001, "loss": 1.4316, "loss/crossentropy": 2.583440065383911, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.21280507743358612, "step": 8165 }, { "epoch": 0.1219361052419385, "grad_norm": 0.19921875, "grad_norm_var": 0.002589539686838786, "learning_rate": 0.0001, "loss": 1.2463, "loss/crossentropy": 2.5648356676101685, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.16427789628505707, "step": 8166 }, { "epoch": 0.12195103741255348, "grad_norm": 0.2216796875, "grad_norm_var": 0.002745167414347331, "learning_rate": 0.0001, "loss": 1.3202, "loss/crossentropy": 2.6374964714050293, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.18736550211906433, "step": 8167 }, { "epoch": 0.12196596958316845, "grad_norm": 0.2265625, "grad_norm_var": 0.002686623732248942, "learning_rate": 0.0001, "loss": 1.2676, "loss/crossentropy": 2.5896657705307007, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.1699237823486328, "step": 8168 }, { "epoch": 0.12198090175378344, "grad_norm": 0.234375, "grad_norm_var": 0.002771437168121338, "learning_rate": 0.0001, "loss": 1.3588, "loss/crossentropy": 2.709481358528137, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.20650090277194977, "step": 8169 }, { "epoch": 0.12199583392439842, "grad_norm": 0.2578125, "grad_norm_var": 0.00277938445409139, "learning_rate": 0.0001, "loss": 1.4016, "loss/crossentropy": 2.3765398263931274, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.20630357414484024, "step": 8170 }, { "epoch": 0.1220107660950134, "grad_norm": 0.263671875, "grad_norm_var": 0.002747837702433268, "learning_rate": 0.0001, "loss": 1.3258, "loss/crossentropy": 2.577816128730774, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.18122916668653488, "step": 8171 }, { "epoch": 0.12202569826562838, "grad_norm": 0.21875, "grad_norm_var": 0.0027131239573160806, "learning_rate": 0.0001, "loss": 1.3359, "loss/crossentropy": 2.7289960384368896, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.2031262218952179, "step": 8172 }, { "epoch": 0.12204063043624336, "grad_norm": 0.2119140625, "grad_norm_var": 0.0026468555132548015, "learning_rate": 0.0001, "loss": 1.2641, "loss/crossentropy": 2.8619320392608643, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.17812998592853546, "step": 8173 }, { "epoch": 0.12205556260685835, "grad_norm": 0.2451171875, "grad_norm_var": 0.0026545047760009764, "learning_rate": 0.0001, "loss": 1.3613, "loss/crossentropy": 2.6330785751342773, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.1933557391166687, "step": 8174 }, { "epoch": 0.12207049477747332, "grad_norm": 0.224609375, "grad_norm_var": 0.0012582778930664063, "learning_rate": 0.0001, "loss": 1.3609, "loss/crossentropy": 2.645016312599182, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.2046273648738861, "step": 8175 }, { "epoch": 0.12208542694808831, "grad_norm": 0.23828125, "grad_norm_var": 0.0006301244099934896, "learning_rate": 0.0001, "loss": 1.4376, "loss/crossentropy": 2.5639355182647705, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.211048923432827, "step": 8176 }, { "epoch": 0.12210035911870329, "grad_norm": 0.275390625, "grad_norm_var": 0.0007049560546875, "learning_rate": 0.0001, "loss": 1.4226, "loss/crossentropy": 2.412020206451416, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.1843232959508896, "step": 8177 }, { "epoch": 0.12211529128931828, "grad_norm": 0.283203125, "grad_norm_var": 0.0008043766021728516, "learning_rate": 0.0001, "loss": 1.5342, "loss/crossentropy": 2.6623377799987793, "loss/fcd": 1.28125, "loss/idx": 10.5, "loss/logits": 0.2529313415288925, "step": 8178 }, { "epoch": 0.12213022345993325, "grad_norm": 0.244140625, "grad_norm_var": 0.0007582982381184896, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.371418595314026, "loss/fcd": 1.28515625, "loss/idx": 10.5, "loss/logits": 0.24372157454490662, "step": 8179 }, { "epoch": 0.12214515563054823, "grad_norm": 0.458984375, "grad_norm_var": 0.003553374608357747, "learning_rate": 0.0001, "loss": 1.4377, "loss/crossentropy": 2.4104071259498596, "loss/fcd": 1.27734375, "loss/idx": 10.5, "loss/logits": 0.1603219397366047, "step": 8180 }, { "epoch": 0.12216008780116322, "grad_norm": 0.216796875, "grad_norm_var": 0.00361248254776001, "learning_rate": 0.0001, "loss": 1.3415, "loss/crossentropy": 2.520206093788147, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.1930360198020935, "step": 8181 }, { "epoch": 0.1221750199717782, "grad_norm": 0.2470703125, "grad_norm_var": 0.0034234205881754558, "learning_rate": 0.0001, "loss": 1.253, "loss/crossentropy": 2.7019104957580566, "loss/fcd": 1.0703125, "loss/idx": 10.5, "loss/logits": 0.18265512585639954, "step": 8182 }, { "epoch": 0.12218995214239319, "grad_norm": 0.234375, "grad_norm_var": 0.0033783237139383953, "learning_rate": 0.0001, "loss": 1.3908, "loss/crossentropy": 2.389409899711609, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.21113800257444382, "step": 8183 }, { "epoch": 0.12220488431300816, "grad_norm": 0.2412109375, "grad_norm_var": 0.0033360640207926433, "learning_rate": 0.0001, "loss": 1.3111, "loss/crossentropy": 2.584867835044861, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.1782820001244545, "step": 8184 }, { "epoch": 0.12221981648362314, "grad_norm": 0.265625, "grad_norm_var": 0.0033070723215738933, "learning_rate": 0.0001, "loss": 1.375, "loss/crossentropy": 2.6960201263427734, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.21098683774471283, "step": 8185 }, { "epoch": 0.12223474865423813, "grad_norm": 0.224609375, "grad_norm_var": 0.0033765157063802083, "learning_rate": 0.0001, "loss": 1.3779, "loss/crossentropy": 2.7566980123519897, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20209477841854095, "step": 8186 }, { "epoch": 0.1222496808248531, "grad_norm": 0.259765625, "grad_norm_var": 0.003373400370279948, "learning_rate": 0.0001, "loss": 1.3297, "loss/crossentropy": 2.632195830345154, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.1812232807278633, "step": 8187 }, { "epoch": 0.12226461299546809, "grad_norm": 0.26953125, "grad_norm_var": 0.0032849629720052083, "learning_rate": 0.0001, "loss": 1.2772, "loss/crossentropy": 3.0056291818618774, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.15610352903604507, "step": 8188 }, { "epoch": 0.12227954516608307, "grad_norm": 0.263671875, "grad_norm_var": 0.0031289060910542807, "learning_rate": 0.0001, "loss": 1.3505, "loss/crossentropy": 2.8425326347351074, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.18646720051765442, "step": 8189 }, { "epoch": 0.12229447733669804, "grad_norm": 0.2578125, "grad_norm_var": 0.003110361099243164, "learning_rate": 0.0001, "loss": 1.2847, "loss/crossentropy": 2.4036026000976562, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.15973328799009323, "step": 8190 }, { "epoch": 0.12230940950731303, "grad_norm": 0.244140625, "grad_norm_var": 0.0030347029368082683, "learning_rate": 0.0001, "loss": 1.3732, "loss/crossentropy": 2.8368067741394043, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.20527871698141098, "step": 8191 }, { "epoch": 0.12232434167792801, "grad_norm": 0.2041015625, "grad_norm_var": 0.003225100040435791, "learning_rate": 0.0001, "loss": 1.155, "loss/crossentropy": 2.5339670181274414, "loss/fcd": 1.0078125, "loss/idx": 10.5, "loss/logits": 0.1472197324037552, "step": 8192 }, { "epoch": 0.122339273848543, "grad_norm": 0.259765625, "grad_norm_var": 0.0032122572263081866, "learning_rate": 0.0001, "loss": 1.3182, "loss/crossentropy": 2.58100426197052, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.17363370209932327, "step": 8193 }, { "epoch": 0.12235420601915797, "grad_norm": 0.2265625, "grad_norm_var": 0.0032445232073465982, "learning_rate": 0.0001, "loss": 1.3032, "loss/crossentropy": 2.619842052459717, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.18597564101219177, "step": 8194 }, { "epoch": 0.12236913818977295, "grad_norm": 0.2353515625, "grad_norm_var": 0.0032648722330729167, "learning_rate": 0.0001, "loss": 1.3864, "loss/crossentropy": 2.379082441329956, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.19106348603963852, "step": 8195 }, { "epoch": 0.12238407036038794, "grad_norm": 0.23828125, "grad_norm_var": 0.00036060015360514324, "learning_rate": 0.0001, "loss": 1.3838, "loss/crossentropy": 2.699434518814087, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.20022457093000412, "step": 8196 }, { "epoch": 0.12239900253100292, "grad_norm": 0.265625, "grad_norm_var": 0.0003387451171875, "learning_rate": 0.0001, "loss": 1.3451, "loss/crossentropy": 2.285479426383972, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.1614759862422943, "step": 8197 }, { "epoch": 0.1224139347016179, "grad_norm": 0.22265625, "grad_norm_var": 0.0003728191057840983, "learning_rate": 0.0001, "loss": 1.2924, "loss/crossentropy": 2.458121418952942, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.16741392016410828, "step": 8198 }, { "epoch": 0.12242886687223288, "grad_norm": 0.2294921875, "grad_norm_var": 0.00038094520568847655, "learning_rate": 0.0001, "loss": 1.5007, "loss/crossentropy": 2.341001033782959, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.24291419237852097, "step": 8199 }, { "epoch": 0.12244379904284787, "grad_norm": 0.2734375, "grad_norm_var": 0.000432741641998291, "learning_rate": 0.0001, "loss": 1.4122, "loss/crossentropy": 2.4875762462615967, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.2012237012386322, "step": 8200 }, { "epoch": 0.12245873121346285, "grad_norm": 0.28125, "grad_norm_var": 0.0004883090655008952, "learning_rate": 0.0001, "loss": 1.3679, "loss/crossentropy": 2.5736395120620728, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.1882302388548851, "step": 8201 }, { "epoch": 0.12247366338407782, "grad_norm": 0.263671875, "grad_norm_var": 0.0004657387733459473, "learning_rate": 0.0001, "loss": 1.5854, "loss/crossentropy": 2.7406270503997803, "loss/fcd": 1.32421875, "loss/idx": 10.5, "loss/logits": 0.2611909955739975, "step": 8202 }, { "epoch": 0.12248859555469281, "grad_norm": 0.2119140625, "grad_norm_var": 0.000544595718383789, "learning_rate": 0.0001, "loss": 1.2856, "loss/crossentropy": 2.4213156700134277, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.17235078662633896, "step": 8203 }, { "epoch": 0.12250352772530779, "grad_norm": 0.515625, "grad_norm_var": 0.005078744888305664, "learning_rate": 0.0001, "loss": 1.6212, "loss/crossentropy": 2.733823776245117, "loss/fcd": 1.34765625, "loss/idx": 10.5, "loss/logits": 0.2735101282596588, "step": 8204 }, { "epoch": 0.12251845989592278, "grad_norm": 0.2216796875, "grad_norm_var": 0.005180068810780843, "learning_rate": 0.0001, "loss": 1.2964, "loss/crossentropy": 2.761838674545288, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.18307503312826157, "step": 8205 }, { "epoch": 0.12253339206653775, "grad_norm": 0.2373046875, "grad_norm_var": 0.00521086057027181, "learning_rate": 0.0001, "loss": 1.3451, "loss/crossentropy": 2.5398387908935547, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.1849023699760437, "step": 8206 }, { "epoch": 0.12254832423715273, "grad_norm": 0.26171875, "grad_norm_var": 0.005197270711263021, "learning_rate": 0.0001, "loss": 1.3991, "loss/crossentropy": 2.670377254486084, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.21164900064468384, "step": 8207 }, { "epoch": 0.12256325640776772, "grad_norm": 0.224609375, "grad_norm_var": 0.0050726850827535, "learning_rate": 0.0001, "loss": 1.3581, "loss/crossentropy": 2.467139720916748, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.19792795926332474, "step": 8208 }, { "epoch": 0.1225781885783827, "grad_norm": 0.2314453125, "grad_norm_var": 0.005125808715820313, "learning_rate": 0.0001, "loss": 1.4244, "loss/crossentropy": 2.706975221633911, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.217376708984375, "step": 8209 }, { "epoch": 0.12259312074899768, "grad_norm": 0.208984375, "grad_norm_var": 0.005220651626586914, "learning_rate": 0.0001, "loss": 1.2384, "loss/crossentropy": 2.731396198272705, "loss/fcd": 1.0703125, "loss/idx": 10.5, "loss/logits": 0.16807913035154343, "step": 8210 }, { "epoch": 0.12260805291961266, "grad_norm": 0.236328125, "grad_norm_var": 0.00521780252456665, "learning_rate": 0.0001, "loss": 1.2872, "loss/crossentropy": 2.629850745201111, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1738840937614441, "step": 8211 }, { "epoch": 0.12262298509022763, "grad_norm": 0.265625, "grad_norm_var": 0.005193547407786051, "learning_rate": 0.0001, "loss": 1.3516, "loss/crossentropy": 2.4092178344726562, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.21101181209087372, "step": 8212 }, { "epoch": 0.12263791726084262, "grad_norm": 0.224609375, "grad_norm_var": 0.00526497761408488, "learning_rate": 0.0001, "loss": 1.2765, "loss/crossentropy": 2.713664412498474, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.18273881077766418, "step": 8213 }, { "epoch": 0.1226528494314576, "grad_norm": 0.30859375, "grad_norm_var": 0.005334214369455973, "learning_rate": 0.0001, "loss": 1.9271, "loss/crossentropy": 2.6105170249938965, "loss/fcd": 1.56640625, "loss/idx": 10.5, "loss/logits": 0.36070725321769714, "step": 8214 }, { "epoch": 0.12266778160207259, "grad_norm": 0.28515625, "grad_norm_var": 0.005284611384073893, "learning_rate": 0.0001, "loss": 1.3617, "loss/crossentropy": 2.6189783811569214, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.18978708237409592, "step": 8215 }, { "epoch": 0.12268271377268757, "grad_norm": 0.28125, "grad_norm_var": 0.00529643694559733, "learning_rate": 0.0001, "loss": 1.2511, "loss/crossentropy": 2.765543580055237, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.1651817113161087, "step": 8216 }, { "epoch": 0.12269764594330254, "grad_norm": 0.234375, "grad_norm_var": 0.005339924494425456, "learning_rate": 0.0001, "loss": 1.4307, "loss/crossentropy": 2.7304043769836426, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.21193015575408936, "step": 8217 }, { "epoch": 0.12271257811391753, "grad_norm": 0.283203125, "grad_norm_var": 0.005364720026652018, "learning_rate": 0.0001, "loss": 1.3941, "loss/crossentropy": 2.486623764038086, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.2026655599474907, "step": 8218 }, { "epoch": 0.1227275102845325, "grad_norm": 0.2294921875, "grad_norm_var": 0.0052607218424479164, "learning_rate": 0.0001, "loss": 1.3304, "loss/crossentropy": 2.571283459663391, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.17810538411140442, "step": 8219 }, { "epoch": 0.1227424424551475, "grad_norm": 0.2392578125, "grad_norm_var": 0.0008221586545308431, "learning_rate": 0.0001, "loss": 1.4056, "loss/crossentropy": 2.634053945541382, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.20640705525875092, "step": 8220 }, { "epoch": 0.12275737462576247, "grad_norm": 0.2158203125, "grad_norm_var": 0.0008451422055562337, "learning_rate": 0.0001, "loss": 1.2915, "loss/crossentropy": 2.5141202211380005, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.17436234652996063, "step": 8221 }, { "epoch": 0.12277230679637746, "grad_norm": 0.24609375, "grad_norm_var": 0.0008374532063802083, "learning_rate": 0.0001, "loss": 1.2461, "loss/crossentropy": 2.853850245475769, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.16404956579208374, "step": 8222 }, { "epoch": 0.12278723896699244, "grad_norm": 0.2119140625, "grad_norm_var": 0.0009049375851949056, "learning_rate": 0.0001, "loss": 1.2159, "loss/crossentropy": 2.663277745246887, "loss/fcd": 1.048828125, "loss/idx": 10.5, "loss/logits": 0.167105995118618, "step": 8223 }, { "epoch": 0.12280217113760741, "grad_norm": 0.2353515625, "grad_norm_var": 0.0008823394775390625, "learning_rate": 0.0001, "loss": 1.2069, "loss/crossentropy": 2.5149346590042114, "loss/fcd": 1.0546875, "loss/idx": 10.5, "loss/logits": 0.1522623524069786, "step": 8224 }, { "epoch": 0.1228171033082224, "grad_norm": 0.21875, "grad_norm_var": 0.0009172081947326661, "learning_rate": 0.0001, "loss": 1.2757, "loss/crossentropy": 2.545382499694824, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.1702507957816124, "step": 8225 }, { "epoch": 0.12283203547883738, "grad_norm": 0.240234375, "grad_norm_var": 0.0008269270261128743, "learning_rate": 0.0001, "loss": 1.3698, "loss/crossentropy": 2.629360795021057, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.20968961715698242, "step": 8226 }, { "epoch": 0.12284696764945237, "grad_norm": 0.28125, "grad_norm_var": 0.000887612501780192, "learning_rate": 0.0001, "loss": 1.5703, "loss/crossentropy": 2.653886914253235, "loss/fcd": 1.3046875, "loss/idx": 10.5, "loss/logits": 0.2656576409935951, "step": 8227 }, { "epoch": 0.12286189982006734, "grad_norm": 0.2255859375, "grad_norm_var": 0.0009047190348307291, "learning_rate": 0.0001, "loss": 1.3547, "loss/crossentropy": 2.660448431968689, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.18279803544282913, "step": 8228 }, { "epoch": 0.12287683199068232, "grad_norm": 0.220703125, "grad_norm_var": 0.0009176254272460938, "learning_rate": 0.0001, "loss": 1.3157, "loss/crossentropy": 2.6398128271102905, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.1789441481232643, "step": 8229 }, { "epoch": 0.12289176416129731, "grad_norm": 0.26953125, "grad_norm_var": 0.0006938298543294271, "learning_rate": 0.0001, "loss": 1.5221, "loss/crossentropy": 2.68191921710968, "loss/fcd": 1.29296875, "loss/idx": 10.5, "loss/logits": 0.22911451756954193, "step": 8230 }, { "epoch": 0.12290669633191229, "grad_norm": 0.259765625, "grad_norm_var": 0.0005977471669514974, "learning_rate": 0.0001, "loss": 1.314, "loss/crossentropy": 2.7638648748397827, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.19682494550943375, "step": 8231 }, { "epoch": 0.12292162850252727, "grad_norm": 0.21875, "grad_norm_var": 0.000525522232055664, "learning_rate": 0.0001, "loss": 1.2034, "loss/crossentropy": 2.6201820373535156, "loss/fcd": 1.041015625, "loss/idx": 10.5, "loss/logits": 0.1623442843556404, "step": 8232 }, { "epoch": 0.12293656067314225, "grad_norm": 0.296875, "grad_norm_var": 0.0007279555002848308, "learning_rate": 0.0001, "loss": 1.5915, "loss/crossentropy": 2.546459674835205, "loss/fcd": 1.359375, "loss/idx": 10.5, "loss/logits": 0.23208534717559814, "step": 8233 }, { "epoch": 0.12295149284375723, "grad_norm": 0.212890625, "grad_norm_var": 0.0006627241770426432, "learning_rate": 0.0001, "loss": 1.2969, "loss/crossentropy": 2.587792754173279, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.17967675626277924, "step": 8234 }, { "epoch": 0.12296642501437222, "grad_norm": 0.326171875, "grad_norm_var": 0.0011257449785868326, "learning_rate": 0.0001, "loss": 1.8418, "loss/crossentropy": 2.5991783142089844, "loss/fcd": 1.4375, "loss/idx": 10.5, "loss/logits": 0.40429647266864777, "step": 8235 }, { "epoch": 0.12298135718498719, "grad_norm": 0.255859375, "grad_norm_var": 0.0011304060618082682, "learning_rate": 0.0001, "loss": 1.5074, "loss/crossentropy": 2.4827386140823364, "loss/fcd": 1.26953125, "loss/idx": 10.5, "loss/logits": 0.23786406219005585, "step": 8236 }, { "epoch": 0.12299628935560218, "grad_norm": 0.234375, "grad_norm_var": 0.0010773301124572753, "learning_rate": 0.0001, "loss": 1.3438, "loss/crossentropy": 2.4904483556747437, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.17971380054950714, "step": 8237 }, { "epoch": 0.12301122152621716, "grad_norm": 0.2490234375, "grad_norm_var": 0.0010774612426757812, "learning_rate": 0.0001, "loss": 1.4166, "loss/crossentropy": 2.5346293449401855, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.21733683347702026, "step": 8238 }, { "epoch": 0.12302615369683215, "grad_norm": 0.251953125, "grad_norm_var": 0.0009886701901753745, "learning_rate": 0.0001, "loss": 1.4788, "loss/crossentropy": 2.7013145685195923, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.23270970582962036, "step": 8239 }, { "epoch": 0.12304108586744712, "grad_norm": 0.263671875, "grad_norm_var": 0.00098417599995931, "learning_rate": 0.0001, "loss": 1.555, "loss/crossentropy": 2.5128180980682373, "loss/fcd": 1.30859375, "loss/idx": 10.5, "loss/logits": 0.2464551329612732, "step": 8240 }, { "epoch": 0.1230560180380621, "grad_norm": 0.283203125, "grad_norm_var": 0.0009616216023763021, "learning_rate": 0.0001, "loss": 1.4851, "loss/crossentropy": 2.554659843444824, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.2233327552676201, "step": 8241 }, { "epoch": 0.12307095020867709, "grad_norm": 0.259765625, "grad_norm_var": 0.0009454091389973959, "learning_rate": 0.0001, "loss": 1.3794, "loss/crossentropy": 2.423561215400696, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.18802157044410706, "step": 8242 }, { "epoch": 0.12308588237929206, "grad_norm": 0.240234375, "grad_norm_var": 0.0009170373280843099, "learning_rate": 0.0001, "loss": 1.3299, "loss/crossentropy": 2.472571551799774, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.18533362448215485, "step": 8243 }, { "epoch": 0.12310081454990705, "grad_norm": 0.2373046875, "grad_norm_var": 0.0008807977040608724, "learning_rate": 0.0001, "loss": 1.3457, "loss/crossentropy": 2.7118953466415405, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.18945875763893127, "step": 8244 }, { "epoch": 0.12311574672052203, "grad_norm": 0.2255859375, "grad_norm_var": 0.0008599559466044108, "learning_rate": 0.0001, "loss": 1.2856, "loss/crossentropy": 2.6729562282562256, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.17228738218545914, "step": 8245 }, { "epoch": 0.123130678891137, "grad_norm": 0.27734375, "grad_norm_var": 0.0008785843849182129, "learning_rate": 0.0001, "loss": 1.2754, "loss/crossentropy": 2.645159602165222, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.16994920372962952, "step": 8246 }, { "epoch": 0.123145611061752, "grad_norm": 0.27734375, "grad_norm_var": 0.0009071946144104004, "learning_rate": 0.0001, "loss": 1.5461, "loss/crossentropy": 2.4704636335372925, "loss/fcd": 1.32421875, "loss/idx": 10.5, "loss/logits": 0.22191842645406723, "step": 8247 }, { "epoch": 0.12316054323236697, "grad_norm": 0.263671875, "grad_norm_var": 0.0008048335711161296, "learning_rate": 0.0001, "loss": 1.3187, "loss/crossentropy": 2.452498197555542, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.1936960369348526, "step": 8248 }, { "epoch": 0.12317547540298196, "grad_norm": 0.271484375, "grad_norm_var": 0.0007192889849344889, "learning_rate": 0.0001, "loss": 1.6526, "loss/crossentropy": 2.7627243995666504, "loss/fcd": 1.3515625, "loss/idx": 10.5, "loss/logits": 0.30102330446243286, "step": 8249 }, { "epoch": 0.12319040757359694, "grad_norm": 0.2138671875, "grad_norm_var": 0.0007134596506754557, "learning_rate": 0.0001, "loss": 1.3186, "loss/crossentropy": 2.7098021507263184, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.19363847374916077, "step": 8250 }, { "epoch": 0.12320533974421191, "grad_norm": 0.205078125, "grad_norm_var": 0.0005321343739827474, "learning_rate": 0.0001, "loss": 1.3709, "loss/crossentropy": 2.5237812995910645, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19901379942893982, "step": 8251 }, { "epoch": 0.1232202719148269, "grad_norm": 0.2265625, "grad_norm_var": 0.0005652745564778646, "learning_rate": 0.0001, "loss": 1.2719, "loss/crossentropy": 2.6959118843078613, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.17037105560302734, "step": 8252 }, { "epoch": 0.12323520408544188, "grad_norm": 0.2294921875, "grad_norm_var": 0.0005761424700419109, "learning_rate": 0.0001, "loss": 1.4071, "loss/crossentropy": 2.492151379585266, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.20785921812057495, "step": 8253 }, { "epoch": 0.12325013625605687, "grad_norm": 0.23828125, "grad_norm_var": 0.0005825678507486979, "learning_rate": 0.0001, "loss": 1.3785, "loss/crossentropy": 2.422188401222229, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.19100165367126465, "step": 8254 }, { "epoch": 0.12326506842667184, "grad_norm": 0.248046875, "grad_norm_var": 0.00058135986328125, "learning_rate": 0.0001, "loss": 1.4753, "loss/crossentropy": 2.6076961755752563, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.23315995931625366, "step": 8255 }, { "epoch": 0.12328000059728682, "grad_norm": 0.2353515625, "grad_norm_var": 0.0005706429481506347, "learning_rate": 0.0001, "loss": 1.3478, "loss/crossentropy": 2.3456549644470215, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.18768469989299774, "step": 8256 }, { "epoch": 0.12329493276790181, "grad_norm": 0.2109375, "grad_norm_var": 0.0005365331967671712, "learning_rate": 0.0001, "loss": 1.2181, "loss/crossentropy": 2.58591890335083, "loss/fcd": 1.0546875, "loss/idx": 10.5, "loss/logits": 0.16337639093399048, "step": 8257 }, { "epoch": 0.12330986493851678, "grad_norm": 0.4296875, "grad_norm_var": 0.0027601202328999836, "learning_rate": 0.0001, "loss": 2.0247, "loss/crossentropy": 2.712099552154541, "loss/fcd": 1.5546875, "loss/idx": 10.5, "loss/logits": 0.4700271934270859, "step": 8258 }, { "epoch": 0.12332479710913177, "grad_norm": 0.2431640625, "grad_norm_var": 0.002756102879842122, "learning_rate": 0.0001, "loss": 1.3326, "loss/crossentropy": 2.429970860481262, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.1763686239719391, "step": 8259 }, { "epoch": 0.12333972927974675, "grad_norm": 0.255859375, "grad_norm_var": 0.002741078535715739, "learning_rate": 0.0001, "loss": 1.4694, "loss/crossentropy": 2.4826749563217163, "loss/fcd": 1.25390625, "loss/idx": 10.5, "loss/logits": 0.2154827117919922, "step": 8260 }, { "epoch": 0.12335466145036174, "grad_norm": 0.2392578125, "grad_norm_var": 0.002702359358469645, "learning_rate": 0.0001, "loss": 1.3915, "loss/crossentropy": 2.814319372177124, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.1923171430826187, "step": 8261 }, { "epoch": 0.12336959362097671, "grad_norm": 0.29296875, "grad_norm_var": 0.002766064802805583, "learning_rate": 0.0001, "loss": 1.6131, "loss/crossentropy": 2.5114020109176636, "loss/fcd": 1.37109375, "loss/idx": 10.5, "loss/logits": 0.24195797741413116, "step": 8262 }, { "epoch": 0.12338452579159169, "grad_norm": 0.287109375, "grad_norm_var": 0.002801032861073812, "learning_rate": 0.0001, "loss": 1.3671, "loss/crossentropy": 2.615282654762268, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19524535536766052, "step": 8263 }, { "epoch": 0.12339945796220668, "grad_norm": 0.25, "grad_norm_var": 0.002798140048980713, "learning_rate": 0.0001, "loss": 1.3056, "loss/crossentropy": 2.652395486831665, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.176737979054451, "step": 8264 }, { "epoch": 0.12341439013282166, "grad_norm": 0.2353515625, "grad_norm_var": 0.0027994632720947264, "learning_rate": 0.0001, "loss": 1.3013, "loss/crossentropy": 2.618975281715393, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.18412014842033386, "step": 8265 }, { "epoch": 0.12342932230343664, "grad_norm": 0.2373046875, "grad_norm_var": 0.002712869644165039, "learning_rate": 0.0001, "loss": 1.4503, "loss/crossentropy": 2.725832939147949, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.21979530155658722, "step": 8266 }, { "epoch": 0.12344425447405162, "grad_norm": 0.2578125, "grad_norm_var": 0.0025424957275390625, "learning_rate": 0.0001, "loss": 1.4177, "loss/crossentropy": 2.771660327911377, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.2107020616531372, "step": 8267 }, { "epoch": 0.1234591866446666, "grad_norm": 0.263671875, "grad_norm_var": 0.002476358413696289, "learning_rate": 0.0001, "loss": 1.327, "loss/crossentropy": 2.4338568449020386, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.18633267283439636, "step": 8268 }, { "epoch": 0.12347411881528159, "grad_norm": 0.2470703125, "grad_norm_var": 0.0024250030517578127, "learning_rate": 0.0001, "loss": 1.3229, "loss/crossentropy": 2.579113245010376, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.18614234030246735, "step": 8269 }, { "epoch": 0.12348905098589656, "grad_norm": 0.244140625, "grad_norm_var": 0.0024096012115478516, "learning_rate": 0.0001, "loss": 1.3445, "loss/crossentropy": 2.627328038215637, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.192192941904068, "step": 8270 }, { "epoch": 0.12350398315651155, "grad_norm": 0.21484375, "grad_norm_var": 0.002536328633626302, "learning_rate": 0.0001, "loss": 1.3687, "loss/crossentropy": 2.5099719762802124, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.20464444160461426, "step": 8271 }, { "epoch": 0.12351891532712653, "grad_norm": 0.234375, "grad_norm_var": 0.002539471785227458, "learning_rate": 0.0001, "loss": 1.3266, "loss/crossentropy": 2.4256705045700073, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.17814695090055466, "step": 8272 }, { "epoch": 0.1235338474977415, "grad_norm": 0.216796875, "grad_norm_var": 0.0025040904680887857, "learning_rate": 0.0001, "loss": 1.3565, "loss/crossentropy": 2.428445816040039, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.18852580338716507, "step": 8273 }, { "epoch": 0.12354877966835649, "grad_norm": 0.2236328125, "grad_norm_var": 0.0004775842030843099, "learning_rate": 0.0001, "loss": 1.3019, "loss/crossentropy": 2.632333517074585, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.1846868097782135, "step": 8274 }, { "epoch": 0.12356371183897147, "grad_norm": 0.23828125, "grad_norm_var": 0.00048122008641560875, "learning_rate": 0.0001, "loss": 1.4562, "loss/crossentropy": 2.5652981996536255, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.2413809895515442, "step": 8275 }, { "epoch": 0.12357864400958646, "grad_norm": 0.267578125, "grad_norm_var": 0.0005049665768941243, "learning_rate": 0.0001, "loss": 1.4113, "loss/crossentropy": 2.5938680171966553, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.2081414759159088, "step": 8276 }, { "epoch": 0.12359357618020143, "grad_norm": 0.255859375, "grad_norm_var": 0.0005053043365478516, "learning_rate": 0.0001, "loss": 1.5178, "loss/crossentropy": 2.4059261083602905, "loss/fcd": 1.29296875, "loss/idx": 10.5, "loss/logits": 0.22484656423330307, "step": 8277 }, { "epoch": 0.12360850835081641, "grad_norm": 0.2255859375, "grad_norm_var": 0.00038439035415649414, "learning_rate": 0.0001, "loss": 1.224, "loss/crossentropy": 2.7816165685653687, "loss/fcd": 1.0546875, "loss/idx": 10.5, "loss/logits": 0.16927455365657806, "step": 8278 }, { "epoch": 0.1236234405214314, "grad_norm": 0.2099609375, "grad_norm_var": 0.0003099918365478516, "learning_rate": 0.0001, "loss": 1.2264, "loss/crossentropy": 2.413386821746826, "loss/fcd": 1.0703125, "loss/idx": 10.5, "loss/logits": 0.15610186755657196, "step": 8279 }, { "epoch": 0.12363837269204637, "grad_norm": 0.2578125, "grad_norm_var": 0.0003253777821858724, "learning_rate": 0.0001, "loss": 1.3622, "loss/crossentropy": 2.7700181007385254, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.19818702340126038, "step": 8280 }, { "epoch": 0.12365330486266136, "grad_norm": 0.279296875, "grad_norm_var": 0.00042247374852498373, "learning_rate": 0.0001, "loss": 1.2887, "loss/crossentropy": 2.6311910152435303, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.17541036754846573, "step": 8281 }, { "epoch": 0.12366823703327634, "grad_norm": 0.244140625, "grad_norm_var": 0.00042099952697753905, "learning_rate": 0.0001, "loss": 1.385, "loss/crossentropy": 2.7561652660369873, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.20533449947834015, "step": 8282 }, { "epoch": 0.12368316920389133, "grad_norm": 0.259765625, "grad_norm_var": 0.0004252115885416667, "learning_rate": 0.0001, "loss": 1.379, "loss/crossentropy": 2.4376840591430664, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.1836751624941826, "step": 8283 }, { "epoch": 0.1236981013745063, "grad_norm": 0.28515625, "grad_norm_var": 0.0005142052968343098, "learning_rate": 0.0001, "loss": 1.3547, "loss/crossentropy": 2.784640908241272, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.20240579545497894, "step": 8284 }, { "epoch": 0.12371303354512128, "grad_norm": 0.2734375, "grad_norm_var": 0.0005683859189351399, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.508130669593811, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.1984042301774025, "step": 8285 }, { "epoch": 0.12372796571573627, "grad_norm": 0.2890625, "grad_norm_var": 0.0006853699684143067, "learning_rate": 0.0001, "loss": 1.4494, "loss/crossentropy": 2.591382145881653, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.20725444704294205, "step": 8286 }, { "epoch": 0.12374289788635125, "grad_norm": 0.33984375, "grad_norm_var": 0.0011014262835184732, "learning_rate": 0.0001, "loss": 1.8332, "loss/crossentropy": 2.55853009223938, "loss/fcd": 1.51953125, "loss/idx": 10.5, "loss/logits": 0.3136419206857681, "step": 8287 }, { "epoch": 0.12375783005696624, "grad_norm": 0.27734375, "grad_norm_var": 0.0010912855466206869, "learning_rate": 0.0001, "loss": 1.4757, "loss/crossentropy": 2.793673515319824, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.22956513613462448, "step": 8288 }, { "epoch": 0.12377276222758121, "grad_norm": 0.2109375, "grad_norm_var": 0.0011263807614644368, "learning_rate": 0.0001, "loss": 1.2989, "loss/crossentropy": 2.6723365783691406, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.18167795985937119, "step": 8289 }, { "epoch": 0.12378769439819619, "grad_norm": 0.30078125, "grad_norm_var": 0.0011386235555013021, "learning_rate": 0.0001, "loss": 1.6454, "loss/crossentropy": 2.747464895248413, "loss/fcd": 1.36328125, "loss/idx": 10.5, "loss/logits": 0.2821616530418396, "step": 8290 }, { "epoch": 0.12380262656881118, "grad_norm": 0.2197265625, "grad_norm_var": 0.0012223521868387859, "learning_rate": 0.0001, "loss": 1.2565, "loss/crossentropy": 2.7001872062683105, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.16665779054164886, "step": 8291 }, { "epoch": 0.12381755873942615, "grad_norm": 0.236328125, "grad_norm_var": 0.0012612620989481608, "learning_rate": 0.0001, "loss": 1.3266, "loss/crossentropy": 2.8343948125839233, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.18989094346761703, "step": 8292 }, { "epoch": 0.12383249091004114, "grad_norm": 0.283203125, "grad_norm_var": 0.0012917478879292807, "learning_rate": 0.0001, "loss": 1.5237, "loss/crossentropy": 2.9056427478790283, "loss/fcd": 1.30859375, "loss/idx": 10.5, "loss/logits": 0.2151138186454773, "step": 8293 }, { "epoch": 0.12384742308065612, "grad_norm": 0.2578125, "grad_norm_var": 0.001200087865193685, "learning_rate": 0.0001, "loss": 1.2821, "loss/crossentropy": 2.7596877813339233, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.17270179092884064, "step": 8294 }, { "epoch": 0.1238623552512711, "grad_norm": 0.240234375, "grad_norm_var": 0.0010390877723693848, "learning_rate": 0.0001, "loss": 1.4158, "loss/crossentropy": 2.535893201828003, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.20098591595888138, "step": 8295 }, { "epoch": 0.12387728742188608, "grad_norm": 0.259765625, "grad_norm_var": 0.0010372122128804524, "learning_rate": 0.0001, "loss": 1.4557, "loss/crossentropy": 2.4558866024017334, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.21346549689769745, "step": 8296 }, { "epoch": 0.12389221959250106, "grad_norm": 0.251953125, "grad_norm_var": 0.0010356545448303224, "learning_rate": 0.0001, "loss": 1.3801, "loss/crossentropy": 2.6553537845611572, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20429591834545135, "step": 8297 }, { "epoch": 0.12390715176311605, "grad_norm": 0.21875, "grad_norm_var": 0.0011443416277567545, "learning_rate": 0.0001, "loss": 1.3526, "loss/crossentropy": 2.5018280744552612, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.18458928167819977, "step": 8298 }, { "epoch": 0.12392208393373103, "grad_norm": 0.2021484375, "grad_norm_var": 0.0013748009999593098, "learning_rate": 0.0001, "loss": 1.1541, "loss/crossentropy": 2.46934175491333, "loss/fcd": 1.0078125, "loss/idx": 10.5, "loss/logits": 0.1462489366531372, "step": 8299 }, { "epoch": 0.12393701610434602, "grad_norm": 0.251953125, "grad_norm_var": 0.0013285954793294272, "learning_rate": 0.0001, "loss": 1.4095, "loss/crossentropy": 2.7819803953170776, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.21026109158992767, "step": 8300 }, { "epoch": 0.12395194827496099, "grad_norm": 0.2353515625, "grad_norm_var": 0.0013361891110738118, "learning_rate": 0.0001, "loss": 1.5089, "loss/crossentropy": 2.761733889579773, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.25113022327423096, "step": 8301 }, { "epoch": 0.12396688044557597, "grad_norm": 0.2294921875, "grad_norm_var": 0.0012850443522135417, "learning_rate": 0.0001, "loss": 1.282, "loss/crossentropy": 2.792596697807312, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.17265793681144714, "step": 8302 }, { "epoch": 0.12398181261619096, "grad_norm": 0.251953125, "grad_norm_var": 0.0007264296213785808, "learning_rate": 0.0001, "loss": 1.5193, "loss/crossentropy": 2.6615042686462402, "loss/fcd": 1.27734375, "loss/idx": 10.5, "loss/logits": 0.24197861552238464, "step": 8303 }, { "epoch": 0.12399674478680593, "grad_norm": 0.2470703125, "grad_norm_var": 0.0006551067034403483, "learning_rate": 0.0001, "loss": 1.5051, "loss/crossentropy": 2.5475796461105347, "loss/fcd": 1.29296875, "loss/idx": 10.5, "loss/logits": 0.21211780607700348, "step": 8304 }, { "epoch": 0.12401167695742092, "grad_norm": 0.2060546875, "grad_norm_var": 0.0006778558095296224, "learning_rate": 0.0001, "loss": 1.268, "loss/crossentropy": 2.5388343334198, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.1781930774450302, "step": 8305 }, { "epoch": 0.1240266091280359, "grad_norm": 0.240234375, "grad_norm_var": 0.00044282277425130207, "learning_rate": 0.0001, "loss": 1.3688, "loss/crossentropy": 2.596130609512329, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.1969696432352066, "step": 8306 }, { "epoch": 0.12404154129865087, "grad_norm": 0.2470703125, "grad_norm_var": 0.00041745503743489585, "learning_rate": 0.0001, "loss": 1.3692, "loss/crossentropy": 2.498867630958557, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.19338110834360123, "step": 8307 }, { "epoch": 0.12405647346926586, "grad_norm": 0.224609375, "grad_norm_var": 0.0004336675008138021, "learning_rate": 0.0001, "loss": 1.2359, "loss/crossentropy": 2.56675124168396, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.1616622433066368, "step": 8308 }, { "epoch": 0.12407140563988084, "grad_norm": 0.2080078125, "grad_norm_var": 0.0003587047259012858, "learning_rate": 0.0001, "loss": 1.3133, "loss/crossentropy": 2.5478742122650146, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.18435630202293396, "step": 8309 }, { "epoch": 0.12408633781049583, "grad_norm": 0.279296875, "grad_norm_var": 0.0004506707191467285, "learning_rate": 0.0001, "loss": 1.395, "loss/crossentropy": 2.9382851123809814, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.18801037967205048, "step": 8310 }, { "epoch": 0.1241012699811108, "grad_norm": 0.259765625, "grad_norm_var": 0.0004826188087463379, "learning_rate": 0.0001, "loss": 1.3749, "loss/crossentropy": 2.760164976119995, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.19913151115179062, "step": 8311 }, { "epoch": 0.12411620215172578, "grad_norm": 0.21484375, "grad_norm_var": 0.0004804253578186035, "learning_rate": 0.0001, "loss": 1.2613, "loss/crossentropy": 2.6228415966033936, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.17144078761339188, "step": 8312 }, { "epoch": 0.12413113432234077, "grad_norm": 0.2275390625, "grad_norm_var": 0.0004642327626546224, "learning_rate": 0.0001, "loss": 1.3524, "loss/crossentropy": 2.6044193506240845, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.18831752985715866, "step": 8313 }, { "epoch": 0.12414606649295575, "grad_norm": 0.318359375, "grad_norm_var": 0.0008817036946614583, "learning_rate": 0.0001, "loss": 1.393, "loss/crossentropy": 2.6833072900772095, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.1742713302373886, "step": 8314 }, { "epoch": 0.12416099866357073, "grad_norm": 0.2216796875, "grad_norm_var": 0.0008063634236653646, "learning_rate": 0.0001, "loss": 1.3161, "loss/crossentropy": 2.3316094875335693, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.17936241626739502, "step": 8315 }, { "epoch": 0.12417593083418571, "grad_norm": 0.224609375, "grad_norm_var": 0.0008148193359375, "learning_rate": 0.0001, "loss": 1.2989, "loss/crossentropy": 2.654107451438904, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.18559648096561432, "step": 8316 }, { "epoch": 0.12419086300480069, "grad_norm": 0.23828125, "grad_norm_var": 0.0008136391639709473, "learning_rate": 0.0001, "loss": 1.3612, "loss/crossentropy": 2.7166839838027954, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.1971667930483818, "step": 8317 }, { "epoch": 0.12420579517541568, "grad_norm": 0.21484375, "grad_norm_var": 0.0008474349975585938, "learning_rate": 0.0001, "loss": 1.3871, "loss/crossentropy": 2.508817434310913, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.20737113803625107, "step": 8318 }, { "epoch": 0.12422072734603065, "grad_norm": 0.2578125, "grad_norm_var": 0.0008596897125244141, "learning_rate": 0.0001, "loss": 1.4452, "loss/crossentropy": 2.5417022705078125, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.2069500833749771, "step": 8319 }, { "epoch": 0.12423565951664564, "grad_norm": 0.228515625, "grad_norm_var": 0.0008621811866760254, "learning_rate": 0.0001, "loss": 1.4273, "loss/crossentropy": 2.4648038148880005, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.18905653804540634, "step": 8320 }, { "epoch": 0.12425059168726062, "grad_norm": 0.208984375, "grad_norm_var": 0.0008501529693603516, "learning_rate": 0.0001, "loss": 1.2646, "loss/crossentropy": 2.5159281492233276, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.17084641754627228, "step": 8321 }, { "epoch": 0.1242655238578756, "grad_norm": 0.2236328125, "grad_norm_var": 0.0008633255958557129, "learning_rate": 0.0001, "loss": 1.2813, "loss/crossentropy": 2.7621541023254395, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.1758154332637787, "step": 8322 }, { "epoch": 0.12428045602849058, "grad_norm": 0.2138671875, "grad_norm_var": 0.0008892655372619629, "learning_rate": 0.0001, "loss": 1.2533, "loss/crossentropy": 2.585997700691223, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.1712523102760315, "step": 8323 }, { "epoch": 0.12429538819910556, "grad_norm": 0.2451171875, "grad_norm_var": 0.0008863449096679687, "learning_rate": 0.0001, "loss": 1.4381, "loss/crossentropy": 2.5279855728149414, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.20368676632642746, "step": 8324 }, { "epoch": 0.12431032036972055, "grad_norm": 0.2392578125, "grad_norm_var": 0.0008283615112304688, "learning_rate": 0.0001, "loss": 1.3201, "loss/crossentropy": 2.4104071855545044, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.19510551542043686, "step": 8325 }, { "epoch": 0.12432525254033552, "grad_norm": 0.228515625, "grad_norm_var": 0.0007134755452473958, "learning_rate": 0.0001, "loss": 1.2412, "loss/crossentropy": 2.4262683391571045, "loss/fcd": 1.0703125, "loss/idx": 10.5, "loss/logits": 0.17088457196950912, "step": 8326 }, { "epoch": 0.12434018471095051, "grad_norm": 0.21875, "grad_norm_var": 0.0006851037343343099, "learning_rate": 0.0001, "loss": 1.3243, "loss/crossentropy": 2.656522512435913, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.17973888665437698, "step": 8327 }, { "epoch": 0.12435511688156549, "grad_norm": 0.22265625, "grad_norm_var": 0.0006702264149983724, "learning_rate": 0.0001, "loss": 1.257, "loss/crossentropy": 2.613413095474243, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.18274237215518951, "step": 8328 }, { "epoch": 0.12437004905218046, "grad_norm": 0.2041015625, "grad_norm_var": 0.0007224877675374349, "learning_rate": 0.0001, "loss": 1.1967, "loss/crossentropy": 2.5222750902175903, "loss/fcd": 1.0390625, "loss/idx": 10.5, "loss/logits": 0.15763292461633682, "step": 8329 }, { "epoch": 0.12438498122279545, "grad_norm": 0.2294921875, "grad_norm_var": 0.00019057194391886394, "learning_rate": 0.0001, "loss": 1.3521, "loss/crossentropy": 2.647510051727295, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.1958184689283371, "step": 8330 }, { "epoch": 0.12439991339341043, "grad_norm": 0.2158203125, "grad_norm_var": 0.00019629398981730144, "learning_rate": 0.0001, "loss": 1.1895, "loss/crossentropy": 2.499895691871643, "loss/fcd": 1.037109375, "loss/idx": 10.5, "loss/logits": 0.1523805931210518, "step": 8331 }, { "epoch": 0.12441484556402542, "grad_norm": 0.2177734375, "grad_norm_var": 0.00020038286844889324, "learning_rate": 0.0001, "loss": 1.3123, "loss/crossentropy": 2.8567947149276733, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.18729059398174286, "step": 8332 }, { "epoch": 0.1244297777346404, "grad_norm": 0.255859375, "grad_norm_var": 0.0002497355143229167, "learning_rate": 0.0001, "loss": 1.4061, "loss/crossentropy": 2.5558489561080933, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.21466901153326035, "step": 8333 }, { "epoch": 0.12444470990525537, "grad_norm": 0.2431640625, "grad_norm_var": 0.00025561253229777016, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.4270009994506836, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.19599098712205887, "step": 8334 }, { "epoch": 0.12445964207587036, "grad_norm": 0.287109375, "grad_norm_var": 0.0004244128863016764, "learning_rate": 0.0001, "loss": 1.4, "loss/crossentropy": 2.770441770553589, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.21644911170005798, "step": 8335 }, { "epoch": 0.12447457424648534, "grad_norm": 0.2265625, "grad_norm_var": 0.0004250804583231608, "learning_rate": 0.0001, "loss": 1.2318, "loss/crossentropy": 2.6198976039886475, "loss/fcd": 1.05859375, "loss/idx": 10.5, "loss/logits": 0.17319904267787933, "step": 8336 }, { "epoch": 0.12448950641710033, "grad_norm": 0.2294921875, "grad_norm_var": 0.00039378801981608075, "learning_rate": 0.0001, "loss": 1.5623, "loss/crossentropy": 2.445886969566345, "loss/fcd": 1.2734375, "loss/idx": 10.5, "loss/logits": 0.28885263949632645, "step": 8337 }, { "epoch": 0.1245044385877153, "grad_norm": 0.220703125, "grad_norm_var": 0.00039732853571573893, "learning_rate": 0.0001, "loss": 1.3144, "loss/crossentropy": 2.4683183431625366, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.19326594471931458, "step": 8338 }, { "epoch": 0.12451937075833028, "grad_norm": 0.205078125, "grad_norm_var": 0.00042239824930826825, "learning_rate": 0.0001, "loss": 1.3072, "loss/crossentropy": 2.5345152616500854, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.18217548727989197, "step": 8339 }, { "epoch": 0.12453430292894527, "grad_norm": 0.251953125, "grad_norm_var": 0.00043855905532836915, "learning_rate": 0.0001, "loss": 1.3115, "loss/crossentropy": 2.713814616203308, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.17864346504211426, "step": 8340 }, { "epoch": 0.12454923509956024, "grad_norm": 0.2392578125, "grad_norm_var": 0.00043855905532836915, "learning_rate": 0.0001, "loss": 1.4419, "loss/crossentropy": 2.799055576324463, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.2192879542708397, "step": 8341 }, { "epoch": 0.12456416727017523, "grad_norm": 0.28125, "grad_norm_var": 0.0005947709083557129, "learning_rate": 0.0001, "loss": 1.5639, "loss/crossentropy": 2.4893194437026978, "loss/fcd": 1.35546875, "loss/idx": 10.5, "loss/logits": 0.2084297016263008, "step": 8342 }, { "epoch": 0.12457909944079021, "grad_norm": 0.255859375, "grad_norm_var": 0.0006038308143615723, "learning_rate": 0.0001, "loss": 1.4948, "loss/crossentropy": 2.570776104927063, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.23310627788305283, "step": 8343 }, { "epoch": 0.1245940316114052, "grad_norm": 0.216796875, "grad_norm_var": 0.0006168961524963378, "learning_rate": 0.0001, "loss": 1.2637, "loss/crossentropy": 2.6127768754959106, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.17383740842342377, "step": 8344 }, { "epoch": 0.12460896378202017, "grad_norm": 0.2177734375, "grad_norm_var": 0.0005699435869852702, "learning_rate": 0.0001, "loss": 1.3723, "loss/crossentropy": 2.652638554573059, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.20041105151176453, "step": 8345 }, { "epoch": 0.12462389595263515, "grad_norm": 0.2578125, "grad_norm_var": 0.0005912621815999349, "learning_rate": 0.0001, "loss": 1.3176, "loss/crossentropy": 2.7564412355422974, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.18479035794734955, "step": 8346 }, { "epoch": 0.12463882812325014, "grad_norm": 0.21484375, "grad_norm_var": 0.00059432586034139, "learning_rate": 0.0001, "loss": 1.3357, "loss/crossentropy": 2.64360249042511, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.2029108926653862, "step": 8347 }, { "epoch": 0.12465376029386512, "grad_norm": 0.380859375, "grad_norm_var": 0.0017987569173177084, "learning_rate": 0.0001, "loss": 1.4677, "loss/crossentropy": 2.7308390140533447, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.2880091220140457, "step": 8348 }, { "epoch": 0.1246686924644801, "grad_norm": 0.201171875, "grad_norm_var": 0.0019358317057291666, "learning_rate": 0.0001, "loss": 1.2278, "loss/crossentropy": 2.5590932369232178, "loss/fcd": 1.05859375, "loss/idx": 10.5, "loss/logits": 0.16925039142370224, "step": 8349 }, { "epoch": 0.12468362463509508, "grad_norm": 0.2099609375, "grad_norm_var": 0.002015542984008789, "learning_rate": 0.0001, "loss": 1.2267, "loss/crossentropy": 2.623908281326294, "loss/fcd": 1.0546875, "loss/idx": 10.5, "loss/logits": 0.1720002517104149, "step": 8350 }, { "epoch": 0.12469855680571006, "grad_norm": 0.2490234375, "grad_norm_var": 0.001884901523590088, "learning_rate": 0.0001, "loss": 1.3806, "loss/crossentropy": 2.6478182077407837, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.20089808106422424, "step": 8351 }, { "epoch": 0.12471348897632505, "grad_norm": 0.2216796875, "grad_norm_var": 0.001895888646443685, "learning_rate": 0.0001, "loss": 1.2362, "loss/crossentropy": 2.4365001916885376, "loss/fcd": 1.06640625, "loss/idx": 10.5, "loss/logits": 0.16983553022146225, "step": 8352 }, { "epoch": 0.12472842114694002, "grad_norm": 0.2158203125, "grad_norm_var": 0.0019282658894856771, "learning_rate": 0.0001, "loss": 1.255, "loss/crossentropy": 2.446800112724304, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.16906462609767914, "step": 8353 }, { "epoch": 0.12474335331755501, "grad_norm": 0.271484375, "grad_norm_var": 0.0019588470458984375, "learning_rate": 0.0001, "loss": 1.3209, "loss/crossentropy": 2.3933218717575073, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.17248890548944473, "step": 8354 }, { "epoch": 0.12475828548816999, "grad_norm": 0.26171875, "grad_norm_var": 0.0018717288970947266, "learning_rate": 0.0001, "loss": 1.3956, "loss/crossentropy": 2.9534326791763306, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.2159244567155838, "step": 8355 }, { "epoch": 0.12477321765878496, "grad_norm": 0.2470703125, "grad_norm_var": 0.0018698016802469889, "learning_rate": 0.0001, "loss": 1.4093, "loss/crossentropy": 2.5212695598602295, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.2022245079278946, "step": 8356 }, { "epoch": 0.12478814982939995, "grad_norm": 0.2578125, "grad_norm_var": 0.0018736521402994792, "learning_rate": 0.0001, "loss": 1.4789, "loss/crossentropy": 2.547060966491699, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.23280023038387299, "step": 8357 }, { "epoch": 0.12480308200001493, "grad_norm": 0.25390625, "grad_norm_var": 0.0017975489298502603, "learning_rate": 0.0001, "loss": 1.4421, "loss/crossentropy": 2.510587692260742, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.1998731940984726, "step": 8358 }, { "epoch": 0.12481801417062992, "grad_norm": 0.2119140625, "grad_norm_var": 0.001859597365061442, "learning_rate": 0.0001, "loss": 1.2502, "loss/crossentropy": 2.5663293600082397, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.1681392416357994, "step": 8359 }, { "epoch": 0.1248329463412449, "grad_norm": 0.279296875, "grad_norm_var": 0.0018845200538635254, "learning_rate": 0.0001, "loss": 1.6437, "loss/crossentropy": 2.7590231895446777, "loss/fcd": 1.37109375, "loss/idx": 10.5, "loss/logits": 0.27260691672563553, "step": 8360 }, { "epoch": 0.12484787851185988, "grad_norm": 0.240234375, "grad_norm_var": 0.0018284956614176432, "learning_rate": 0.0001, "loss": 1.3816, "loss/crossentropy": 2.5437679290771484, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20584028959274292, "step": 8361 }, { "epoch": 0.12486281068247486, "grad_norm": 0.259765625, "grad_norm_var": 0.001831181844075521, "learning_rate": 0.0001, "loss": 1.5055, "loss/crossentropy": 2.4699372053146362, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.2398800179362297, "step": 8362 }, { "epoch": 0.12487774285308983, "grad_norm": 0.3046875, "grad_norm_var": 0.0019320805867513021, "learning_rate": 0.0001, "loss": 1.5465, "loss/crossentropy": 2.4583003520965576, "loss/fcd": 1.3046875, "loss/idx": 10.5, "loss/logits": 0.24177055805921555, "step": 8363 }, { "epoch": 0.12489267502370482, "grad_norm": 0.265625, "grad_norm_var": 0.0008151849110921224, "learning_rate": 0.0001, "loss": 1.4224, "loss/crossentropy": 2.6192545890808105, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.19582900404930115, "step": 8364 }, { "epoch": 0.1249076071943198, "grad_norm": 0.2578125, "grad_norm_var": 0.0006699879964192708, "learning_rate": 0.0001, "loss": 1.3972, "loss/crossentropy": 2.5893019437789917, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.2018703818321228, "step": 8365 }, { "epoch": 0.12492253936493479, "grad_norm": 0.22265625, "grad_norm_var": 0.0006114602088928223, "learning_rate": 0.0001, "loss": 1.3592, "loss/crossentropy": 2.4090335369110107, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.1990906074643135, "step": 8366 }, { "epoch": 0.12493747153554977, "grad_norm": 0.37890625, "grad_norm_var": 0.0016266981760660808, "learning_rate": 0.0001, "loss": 1.5137, "loss/crossentropy": 2.5856059789657593, "loss/fcd": 1.31640625, "loss/idx": 10.5, "loss/logits": 0.19734155386686325, "step": 8367 }, { "epoch": 0.12495240370616474, "grad_norm": 0.2470703125, "grad_norm_var": 0.0015392939249674479, "learning_rate": 0.0001, "loss": 1.3097, "loss/crossentropy": 2.4631015062332153, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.184673972427845, "step": 8368 }, { "epoch": 0.12496733587677973, "grad_norm": 0.2412109375, "grad_norm_var": 0.00142668088277181, "learning_rate": 0.0001, "loss": 1.4749, "loss/crossentropy": 2.1807886958122253, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.21711231023073196, "step": 8369 }, { "epoch": 0.1249822680473947, "grad_norm": 0.255859375, "grad_norm_var": 0.0014233748118082682, "learning_rate": 0.0001, "loss": 1.3616, "loss/crossentropy": 2.6882169246673584, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.2053394615650177, "step": 8370 }, { "epoch": 0.1249972002180097, "grad_norm": 0.310546875, "grad_norm_var": 0.00157318115234375, "learning_rate": 0.0001, "loss": 1.3341, "loss/crossentropy": 2.5191773176193237, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.162245512008667, "step": 8371 }, { "epoch": 0.12501213238862469, "grad_norm": 0.240234375, "grad_norm_var": 0.0015921235084533692, "learning_rate": 0.0001, "loss": 1.2975, "loss/crossentropy": 2.82987642288208, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1841731071472168, "step": 8372 }, { "epoch": 0.12502706455923965, "grad_norm": 0.271484375, "grad_norm_var": 0.0015921235084533692, "learning_rate": 0.0001, "loss": 1.28, "loss/crossentropy": 2.443350076675415, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.17448950558900833, "step": 8373 }, { "epoch": 0.12504199672985464, "grad_norm": 0.306640625, "grad_norm_var": 0.0016873955726623535, "learning_rate": 0.0001, "loss": 1.3627, "loss/crossentropy": 2.51673424243927, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.18693893402814865, "step": 8374 }, { "epoch": 0.12505692890046963, "grad_norm": 0.2353515625, "grad_norm_var": 0.0015452980995178222, "learning_rate": 0.0001, "loss": 1.3264, "loss/crossentropy": 2.63717520236969, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.18191654980182648, "step": 8375 }, { "epoch": 0.1250718610710846, "grad_norm": 0.28125, "grad_norm_var": 0.00154800017674764, "learning_rate": 0.0001, "loss": 1.2653, "loss/crossentropy": 2.5860464572906494, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.1598452553153038, "step": 8376 }, { "epoch": 0.12508679324169958, "grad_norm": 0.2236328125, "grad_norm_var": 0.001631021499633789, "learning_rate": 0.0001, "loss": 1.3951, "loss/crossentropy": 2.5035566091537476, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.21932333707809448, "step": 8377 }, { "epoch": 0.12510172541231457, "grad_norm": 0.26171875, "grad_norm_var": 0.001628875732421875, "learning_rate": 0.0001, "loss": 1.3032, "loss/crossentropy": 2.448575496673584, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.16643384099006653, "step": 8378 }, { "epoch": 0.12511665758292953, "grad_norm": 0.25, "grad_norm_var": 0.0015558878580729167, "learning_rate": 0.0001, "loss": 1.3489, "loss/crossentropy": 2.5045523643493652, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.19268658012151718, "step": 8379 }, { "epoch": 0.12513158975354452, "grad_norm": 0.2314453125, "grad_norm_var": 0.0016289035479227702, "learning_rate": 0.0001, "loss": 1.2718, "loss/crossentropy": 2.59182071685791, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.16636518388986588, "step": 8380 }, { "epoch": 0.1251465219241595, "grad_norm": 0.22265625, "grad_norm_var": 0.0017327586809794107, "learning_rate": 0.0001, "loss": 1.3813, "loss/crossentropy": 2.6549370288848877, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.20164594799280167, "step": 8381 }, { "epoch": 0.1251614540947745, "grad_norm": 0.2490234375, "grad_norm_var": 0.0016403834025065104, "learning_rate": 0.0001, "loss": 1.4655, "loss/crossentropy": 2.434404134750366, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.21551372110843658, "step": 8382 }, { "epoch": 0.12517638626538946, "grad_norm": 0.2373046875, "grad_norm_var": 0.0007040937741597494, "learning_rate": 0.0001, "loss": 1.4138, "loss/crossentropy": 2.4250564575195312, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.22238729894161224, "step": 8383 }, { "epoch": 0.12519131843600445, "grad_norm": 0.2314453125, "grad_norm_var": 0.0007339755694071452, "learning_rate": 0.0001, "loss": 1.438, "loss/crossentropy": 2.668563723564148, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.21537493914365768, "step": 8384 }, { "epoch": 0.12520625060661944, "grad_norm": 0.232421875, "grad_norm_var": 0.0007527510325113933, "learning_rate": 0.0001, "loss": 1.2886, "loss/crossentropy": 2.8316574096679688, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.18698804080486298, "step": 8385 }, { "epoch": 0.1252211827772344, "grad_norm": 0.2255859375, "grad_norm_var": 0.0007967273394266764, "learning_rate": 0.0001, "loss": 1.3651, "loss/crossentropy": 2.465950608253479, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19318388402462006, "step": 8386 }, { "epoch": 0.1252361149478494, "grad_norm": 0.2353515625, "grad_norm_var": 0.0005498091379801432, "learning_rate": 0.0001, "loss": 1.2419, "loss/crossentropy": 2.513322591781616, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.15204565227031708, "step": 8387 }, { "epoch": 0.12525104711846438, "grad_norm": 0.2294921875, "grad_norm_var": 0.0005652387936909993, "learning_rate": 0.0001, "loss": 1.3867, "loss/crossentropy": 2.56515109539032, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.20701896399259567, "step": 8388 }, { "epoch": 0.12526597928907934, "grad_norm": 0.2373046875, "grad_norm_var": 0.0005189259847005208, "learning_rate": 0.0001, "loss": 1.3216, "loss/crossentropy": 2.7165313959121704, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.18487867712974548, "step": 8389 }, { "epoch": 0.12528091145969433, "grad_norm": 0.2109375, "grad_norm_var": 0.0002813816070556641, "learning_rate": 0.0001, "loss": 1.3286, "loss/crossentropy": 2.3924695253372192, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.17625483870506287, "step": 8390 }, { "epoch": 0.12529584363030932, "grad_norm": 0.2109375, "grad_norm_var": 0.0003245949745178223, "learning_rate": 0.0001, "loss": 1.3638, "loss/crossentropy": 2.691208600997925, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.19585587084293365, "step": 8391 }, { "epoch": 0.1253107758009243, "grad_norm": 0.26171875, "grad_norm_var": 0.0002297043800354004, "learning_rate": 0.0001, "loss": 1.4089, "loss/crossentropy": 2.181430697441101, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.19403504580259323, "step": 8392 }, { "epoch": 0.12532570797153927, "grad_norm": 0.25, "grad_norm_var": 0.00023517608642578124, "learning_rate": 0.0001, "loss": 1.3915, "loss/crossentropy": 2.6883333921432495, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.1961624100804329, "step": 8393 }, { "epoch": 0.12534064014215426, "grad_norm": 0.259765625, "grad_norm_var": 0.00022873878479003906, "learning_rate": 0.0001, "loss": 1.368, "loss/crossentropy": 2.540703773498535, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.20391756296157837, "step": 8394 }, { "epoch": 0.12535557231276925, "grad_norm": 0.21484375, "grad_norm_var": 0.00024018287658691406, "learning_rate": 0.0001, "loss": 1.2795, "loss/crossentropy": 2.1453336477279663, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.1701049506664276, "step": 8395 }, { "epoch": 0.12537050448338422, "grad_norm": 0.404296875, "grad_norm_var": 0.002054083347320557, "learning_rate": 0.0001, "loss": 1.4463, "loss/crossentropy": 2.3833870887756348, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.20797647535800934, "step": 8396 }, { "epoch": 0.1253854366539992, "grad_norm": 0.419921875, "grad_norm_var": 0.003909869988759359, "learning_rate": 0.0001, "loss": 1.7441, "loss/crossentropy": 2.8210569620132446, "loss/fcd": 1.484375, "loss/idx": 10.5, "loss/logits": 0.2597169354557991, "step": 8397 }, { "epoch": 0.1254003688246142, "grad_norm": 0.21875, "grad_norm_var": 0.0039989312489827475, "learning_rate": 0.0001, "loss": 1.2584, "loss/crossentropy": 2.5618200302124023, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.1724449247121811, "step": 8398 }, { "epoch": 0.12541530099522918, "grad_norm": 0.2119140625, "grad_norm_var": 0.004099146525065104, "learning_rate": 0.0001, "loss": 1.3723, "loss/crossentropy": 2.5450295209884644, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.19649042189121246, "step": 8399 }, { "epoch": 0.12543023316584415, "grad_norm": 0.224609375, "grad_norm_var": 0.004122094313303629, "learning_rate": 0.0001, "loss": 1.378, "loss/crossentropy": 2.559924364089966, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.19440855085849762, "step": 8400 }, { "epoch": 0.12544516533645914, "grad_norm": 0.283203125, "grad_norm_var": 0.004143997033437093, "learning_rate": 0.0001, "loss": 1.338, "loss/crossentropy": 2.7678186893463135, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.18565116077661514, "step": 8401 }, { "epoch": 0.12546009750707413, "grad_norm": 0.236328125, "grad_norm_var": 0.004107411702473958, "learning_rate": 0.0001, "loss": 1.2775, "loss/crossentropy": 2.5748265981674194, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.17590318620204926, "step": 8402 }, { "epoch": 0.1254750296776891, "grad_norm": 0.2333984375, "grad_norm_var": 0.004113245010375977, "learning_rate": 0.0001, "loss": 1.1943, "loss/crossentropy": 2.6800202131271362, "loss/fcd": 1.0390625, "loss/idx": 10.5, "loss/logits": 0.15527375787496567, "step": 8403 }, { "epoch": 0.12548996184830408, "grad_norm": 0.263671875, "grad_norm_var": 0.004062203566233317, "learning_rate": 0.0001, "loss": 1.4946, "loss/crossentropy": 2.6723525524139404, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.2328757420182228, "step": 8404 }, { "epoch": 0.12550489401891907, "grad_norm": 0.2275390625, "grad_norm_var": 0.00409621795018514, "learning_rate": 0.0001, "loss": 1.3292, "loss/crossentropy": 2.6888813972473145, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.19243353605270386, "step": 8405 }, { "epoch": 0.12551982618953403, "grad_norm": 0.20703125, "grad_norm_var": 0.004121808211008707, "learning_rate": 0.0001, "loss": 1.1922, "loss/crossentropy": 2.6466400623321533, "loss/fcd": 1.029296875, "loss/idx": 10.5, "loss/logits": 0.16287868469953537, "step": 8406 }, { "epoch": 0.12553475836014902, "grad_norm": 0.2373046875, "grad_norm_var": 0.003999821345011393, "learning_rate": 0.0001, "loss": 1.3349, "loss/crossentropy": 2.404104232788086, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.1942351683974266, "step": 8407 }, { "epoch": 0.125549690530764, "grad_norm": 0.2421875, "grad_norm_var": 0.004018259048461914, "learning_rate": 0.0001, "loss": 1.3849, "loss/crossentropy": 2.81644868850708, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.21299909055233002, "step": 8408 }, { "epoch": 0.125564622701379, "grad_norm": 0.23046875, "grad_norm_var": 0.004064035415649414, "learning_rate": 0.0001, "loss": 1.3468, "loss/crossentropy": 2.545502781867981, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.19055774807929993, "step": 8409 }, { "epoch": 0.12557955487199396, "grad_norm": 0.2314453125, "grad_norm_var": 0.004104483127593994, "learning_rate": 0.0001, "loss": 1.2976, "loss/crossentropy": 2.598615288734436, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.17654547095298767, "step": 8410 }, { "epoch": 0.12559448704260895, "grad_norm": 0.2158203125, "grad_norm_var": 0.004099257787068685, "learning_rate": 0.0001, "loss": 1.3058, "loss/crossentropy": 2.425520181655884, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1925189420580864, "step": 8411 }, { "epoch": 0.12560941921322394, "grad_norm": 0.25, "grad_norm_var": 0.0025259017944335937, "learning_rate": 0.0001, "loss": 1.4256, "loss/crossentropy": 2.474208116531372, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.21076399087905884, "step": 8412 }, { "epoch": 0.1256243513838389, "grad_norm": 0.224609375, "grad_norm_var": 0.0003769556681315104, "learning_rate": 0.0001, "loss": 1.286, "loss/crossentropy": 2.436441421508789, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.1804960072040558, "step": 8413 }, { "epoch": 0.1256392835544539, "grad_norm": 0.2314453125, "grad_norm_var": 0.0003618200620015462, "learning_rate": 0.0001, "loss": 1.3932, "loss/crossentropy": 2.8452833890914917, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.2095685675740242, "step": 8414 }, { "epoch": 0.12565421572506888, "grad_norm": 0.263671875, "grad_norm_var": 0.0003738244374593099, "learning_rate": 0.0001, "loss": 1.296, "loss/crossentropy": 2.3731945753097534, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.16712382435798645, "step": 8415 }, { "epoch": 0.12566914789568387, "grad_norm": 0.251953125, "grad_norm_var": 0.0003729343414306641, "learning_rate": 0.0001, "loss": 1.5181, "loss/crossentropy": 2.401524782180786, "loss/fcd": 1.3125, "loss/idx": 10.5, "loss/logits": 0.20557286590337753, "step": 8416 }, { "epoch": 0.12568408006629883, "grad_norm": 0.25, "grad_norm_var": 0.0002478281656901042, "learning_rate": 0.0001, "loss": 1.4014, "loss/crossentropy": 2.3845787048339844, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.21004148572683334, "step": 8417 }, { "epoch": 0.12569901223691382, "grad_norm": 0.2236328125, "grad_norm_var": 0.00025955438613891604, "learning_rate": 0.0001, "loss": 1.3685, "loss/crossentropy": 2.666074752807617, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.20443559437990189, "step": 8418 }, { "epoch": 0.1257139444075288, "grad_norm": 0.2255859375, "grad_norm_var": 0.00026661157608032227, "learning_rate": 0.0001, "loss": 1.2893, "loss/crossentropy": 2.6505569219589233, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.17992984503507614, "step": 8419 }, { "epoch": 0.12572887657814377, "grad_norm": 0.248046875, "grad_norm_var": 0.00022426843643188476, "learning_rate": 0.0001, "loss": 1.3285, "loss/crossentropy": 2.2938499450683594, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.176188126206398, "step": 8420 }, { "epoch": 0.12574380874875876, "grad_norm": 0.2216796875, "grad_norm_var": 0.00023227930068969727, "learning_rate": 0.0001, "loss": 1.3123, "loss/crossentropy": 2.532732605934143, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.18732839077711105, "step": 8421 }, { "epoch": 0.12575874091937375, "grad_norm": 0.234375, "grad_norm_var": 0.0001782059669494629, "learning_rate": 0.0001, "loss": 1.5307, "loss/crossentropy": 2.436761736869812, "loss/fcd": 1.27734375, "loss/idx": 10.5, "loss/logits": 0.25330792367458344, "step": 8422 }, { "epoch": 0.1257736730899887, "grad_norm": 0.2138671875, "grad_norm_var": 0.00020967721939086915, "learning_rate": 0.0001, "loss": 1.2435, "loss/crossentropy": 2.5713542699813843, "loss/fcd": 1.068359375, "loss/idx": 10.5, "loss/logits": 0.17509758472442627, "step": 8423 }, { "epoch": 0.1257886052606037, "grad_norm": 0.28125, "grad_norm_var": 0.00034287373224894203, "learning_rate": 0.0001, "loss": 1.3709, "loss/crossentropy": 2.62760591506958, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.19123157858848572, "step": 8424 }, { "epoch": 0.1258035374312187, "grad_norm": 0.23828125, "grad_norm_var": 0.00033950408299764, "learning_rate": 0.0001, "loss": 1.3919, "loss/crossentropy": 2.6642781496047974, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.21221187710762024, "step": 8425 }, { "epoch": 0.12581846960183368, "grad_norm": 0.236328125, "grad_norm_var": 0.0003368218739827474, "learning_rate": 0.0001, "loss": 1.3815, "loss/crossentropy": 2.568209648132324, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20575908571481705, "step": 8426 }, { "epoch": 0.12583340177244864, "grad_norm": 0.27734375, "grad_norm_var": 0.0003901441891988119, "learning_rate": 0.0001, "loss": 1.2993, "loss/crossentropy": 2.7037177085876465, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.1782272681593895, "step": 8427 }, { "epoch": 0.12584833394306363, "grad_norm": 0.22265625, "grad_norm_var": 0.00040772358576456703, "learning_rate": 0.0001, "loss": 1.2509, "loss/crossentropy": 2.7199326753616333, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.1727445423603058, "step": 8428 }, { "epoch": 0.12586326611367862, "grad_norm": 0.2197265625, "grad_norm_var": 0.00041942596435546876, "learning_rate": 0.0001, "loss": 1.2404, "loss/crossentropy": 2.762976884841919, "loss/fcd": 1.0703125, "loss/idx": 10.5, "loss/logits": 0.17005831748247147, "step": 8429 }, { "epoch": 0.12587819828429359, "grad_norm": 0.2431640625, "grad_norm_var": 0.0004146575927734375, "learning_rate": 0.0001, "loss": 1.3551, "loss/crossentropy": 2.5353479385375977, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.1792692318558693, "step": 8430 }, { "epoch": 0.12589313045490858, "grad_norm": 0.26171875, "grad_norm_var": 0.0004089196523030599, "learning_rate": 0.0001, "loss": 1.4767, "loss/crossentropy": 2.5810532569885254, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.23448334634304047, "step": 8431 }, { "epoch": 0.12590806262552356, "grad_norm": 0.25390625, "grad_norm_var": 0.0004121144612630208, "learning_rate": 0.0001, "loss": 1.3932, "loss/crossentropy": 2.6058391332626343, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.20182491838932037, "step": 8432 }, { "epoch": 0.12592299479613855, "grad_norm": 0.26171875, "grad_norm_var": 0.00043519337972005207, "learning_rate": 0.0001, "loss": 1.3758, "loss/crossentropy": 2.3702399730682373, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.18826251477003098, "step": 8433 }, { "epoch": 0.12593792696675352, "grad_norm": 0.2578125, "grad_norm_var": 0.0004269878069559733, "learning_rate": 0.0001, "loss": 1.3501, "loss/crossentropy": 2.7280752658843994, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.18994690477848053, "step": 8434 }, { "epoch": 0.1259528591373685, "grad_norm": 0.2421875, "grad_norm_var": 0.00040435791015625, "learning_rate": 0.0001, "loss": 1.2898, "loss/crossentropy": 2.6296017169952393, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18046247959136963, "step": 8435 }, { "epoch": 0.1259677913079835, "grad_norm": 0.26171875, "grad_norm_var": 0.0004222710927327474, "learning_rate": 0.0001, "loss": 1.3087, "loss/crossentropy": 2.6033695936203003, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.17196330428123474, "step": 8436 }, { "epoch": 0.12598272347859846, "grad_norm": 0.5625, "grad_norm_var": 0.006600471337636312, "learning_rate": 0.0001, "loss": 1.8212, "loss/crossentropy": 2.474869966506958, "loss/fcd": 1.5, "loss/idx": 10.5, "loss/logits": 0.321165032684803, "step": 8437 }, { "epoch": 0.12599765564921345, "grad_norm": 0.2421875, "grad_norm_var": 0.0065705259641011555, "learning_rate": 0.0001, "loss": 1.3739, "loss/crossentropy": 2.549979090690613, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.18641410768032074, "step": 8438 }, { "epoch": 0.12601258781982844, "grad_norm": 0.314453125, "grad_norm_var": 0.00648662249247233, "learning_rate": 0.0001, "loss": 1.6322, "loss/crossentropy": 2.617988705635071, "loss/fcd": 1.35546875, "loss/idx": 10.5, "loss/logits": 0.2766813337802887, "step": 8439 }, { "epoch": 0.1260275199904434, "grad_norm": 0.23828125, "grad_norm_var": 0.006557957331339518, "learning_rate": 0.0001, "loss": 1.3956, "loss/crossentropy": 2.504966378211975, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.20033042132854462, "step": 8440 }, { "epoch": 0.1260424521610584, "grad_norm": 0.2421875, "grad_norm_var": 0.006541935602823893, "learning_rate": 0.0001, "loss": 1.4361, "loss/crossentropy": 2.863549590110779, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.22910535335540771, "step": 8441 }, { "epoch": 0.12605738433167338, "grad_norm": 0.255859375, "grad_norm_var": 0.006475178400675455, "learning_rate": 0.0001, "loss": 1.3681, "loss/crossentropy": 2.559895157814026, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19618114084005356, "step": 8442 }, { "epoch": 0.12607231650228837, "grad_norm": 0.2373046875, "grad_norm_var": 0.006548655033111572, "learning_rate": 0.0001, "loss": 1.2625, "loss/crossentropy": 2.635854721069336, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.17263008654117584, "step": 8443 }, { "epoch": 0.12608724867290333, "grad_norm": 0.2578125, "grad_norm_var": 0.006404745578765869, "learning_rate": 0.0001, "loss": 1.5294, "loss/crossentropy": 2.5115593671798706, "loss/fcd": 1.2890625, "loss/idx": 10.5, "loss/logits": 0.2402999997138977, "step": 8444 }, { "epoch": 0.12610218084351832, "grad_norm": 0.25, "grad_norm_var": 0.006250890096028646, "learning_rate": 0.0001, "loss": 1.5654, "loss/crossentropy": 2.6878165006637573, "loss/fcd": 1.30859375, "loss/idx": 10.5, "loss/logits": 0.25685181468725204, "step": 8445 }, { "epoch": 0.1261171130141333, "grad_norm": 0.2255859375, "grad_norm_var": 0.006342299779256185, "learning_rate": 0.0001, "loss": 1.3944, "loss/crossentropy": 2.5942152738571167, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.2186010479927063, "step": 8446 }, { "epoch": 0.12613204518474827, "grad_norm": 0.248046875, "grad_norm_var": 0.006374231974283854, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.6772940158843994, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.21007559448480606, "step": 8447 }, { "epoch": 0.12614697735536326, "grad_norm": 0.2421875, "grad_norm_var": 0.006411043802897135, "learning_rate": 0.0001, "loss": 1.2555, "loss/crossentropy": 2.796460747718811, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.16959340125322342, "step": 8448 }, { "epoch": 0.12616190952597825, "grad_norm": 0.2412109375, "grad_norm_var": 0.006463364760080973, "learning_rate": 0.0001, "loss": 1.3544, "loss/crossentropy": 2.3659642934799194, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.1825283169746399, "step": 8449 }, { "epoch": 0.1261768416965932, "grad_norm": 0.2275390625, "grad_norm_var": 0.006569671630859375, "learning_rate": 0.0001, "loss": 1.2613, "loss/crossentropy": 2.83469021320343, "loss/fcd": 1.083984375, "loss/idx": 10.5, "loss/logits": 0.1772800162434578, "step": 8450 }, { "epoch": 0.1261917738672082, "grad_norm": 0.2216796875, "grad_norm_var": 0.006666719913482666, "learning_rate": 0.0001, "loss": 1.2621, "loss/crossentropy": 2.514620065689087, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.16832908987998962, "step": 8451 }, { "epoch": 0.1262067060378232, "grad_norm": 0.21875, "grad_norm_var": 0.0068111379941304525, "learning_rate": 0.0001, "loss": 1.3213, "loss/crossentropy": 2.5673075914382935, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.19237347692251205, "step": 8452 }, { "epoch": 0.12622163820843818, "grad_norm": 0.232421875, "grad_norm_var": 0.00048786401748657227, "learning_rate": 0.0001, "loss": 1.2985, "loss/crossentropy": 2.7191386222839355, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.18135231733322144, "step": 8453 }, { "epoch": 0.12623657037905314, "grad_norm": 0.234375, "grad_norm_var": 0.000493013858795166, "learning_rate": 0.0001, "loss": 1.1764, "loss/crossentropy": 2.5655171871185303, "loss/fcd": 1.03515625, "loss/idx": 10.5, "loss/logits": 0.14127486944198608, "step": 8454 }, { "epoch": 0.12625150254966813, "grad_norm": 0.228515625, "grad_norm_var": 0.00013564030329386392, "learning_rate": 0.0001, "loss": 1.3346, "loss/crossentropy": 2.717956304550171, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.20956839621067047, "step": 8455 }, { "epoch": 0.12626643472028312, "grad_norm": 0.2333984375, "grad_norm_var": 0.00013669331868489584, "learning_rate": 0.0001, "loss": 1.3659, "loss/crossentropy": 2.701449155807495, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19398163259029388, "step": 8456 }, { "epoch": 0.12628136689089808, "grad_norm": 0.236328125, "grad_norm_var": 0.0001350243886311849, "learning_rate": 0.0001, "loss": 1.3391, "loss/crossentropy": 2.7915308475494385, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.17899147421121597, "step": 8457 }, { "epoch": 0.12629629906151307, "grad_norm": 0.216796875, "grad_norm_var": 0.00013184547424316406, "learning_rate": 0.0001, "loss": 1.2697, "loss/crossentropy": 2.5610536336898804, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.17208168655633926, "step": 8458 }, { "epoch": 0.12631123123212806, "grad_norm": 0.2373046875, "grad_norm_var": 0.00013184547424316406, "learning_rate": 0.0001, "loss": 1.3755, "loss/crossentropy": 2.518908739089966, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.20750117301940918, "step": 8459 }, { "epoch": 0.12632616340274305, "grad_norm": 0.259765625, "grad_norm_var": 0.00013815561930338542, "learning_rate": 0.0001, "loss": 1.4268, "loss/crossentropy": 2.5316044092178345, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.20414754003286362, "step": 8460 }, { "epoch": 0.12634109557335801, "grad_norm": 0.2353515625, "grad_norm_var": 0.00012152592341105143, "learning_rate": 0.0001, "loss": 1.3879, "loss/crossentropy": 2.788015127182007, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.21209152042865753, "step": 8461 }, { "epoch": 0.126356027743973, "grad_norm": 0.27734375, "grad_norm_var": 0.00023293495178222656, "learning_rate": 0.0001, "loss": 1.3004, "loss/crossentropy": 2.49200701713562, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.16755957901477814, "step": 8462 }, { "epoch": 0.126370959914588, "grad_norm": 0.30078125, "grad_norm_var": 0.0004848480224609375, "learning_rate": 0.0001, "loss": 1.5155, "loss/crossentropy": 2.6372960805892944, "loss/fcd": 1.28515625, "loss/idx": 10.5, "loss/logits": 0.23031283915042877, "step": 8463 }, { "epoch": 0.12638589208520296, "grad_norm": 0.2451171875, "grad_norm_var": 0.000486147403717041, "learning_rate": 0.0001, "loss": 1.3497, "loss/crossentropy": 2.352324604988098, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.18559188395738602, "step": 8464 }, { "epoch": 0.12640082425581795, "grad_norm": 0.220703125, "grad_norm_var": 0.000510263442993164, "learning_rate": 0.0001, "loss": 1.3681, "loss/crossentropy": 2.499518036842346, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.20401161164045334, "step": 8465 }, { "epoch": 0.12641575642643293, "grad_norm": 0.26953125, "grad_norm_var": 0.0005555431048075358, "learning_rate": 0.0001, "loss": 1.3704, "loss/crossentropy": 2.766141653060913, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19848840683698654, "step": 8466 }, { "epoch": 0.1264306885970479, "grad_norm": 0.279296875, "grad_norm_var": 0.0006087621053059896, "learning_rate": 0.0001, "loss": 1.4532, "loss/crossentropy": 2.7462639808654785, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.22271570563316345, "step": 8467 }, { "epoch": 0.1264456207676629, "grad_norm": 0.25, "grad_norm_var": 0.000558916727701823, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.4440733194351196, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.18262051790952682, "step": 8468 }, { "epoch": 0.12646055293827788, "grad_norm": 0.2421875, "grad_norm_var": 0.0005454858144124349, "learning_rate": 0.0001, "loss": 1.3072, "loss/crossentropy": 2.7522940635681152, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.19000756740570068, "step": 8469 }, { "epoch": 0.12647548510889287, "grad_norm": 0.251953125, "grad_norm_var": 0.0005330403645833333, "learning_rate": 0.0001, "loss": 1.3768, "loss/crossentropy": 2.4937840700149536, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.2010103240609169, "step": 8470 }, { "epoch": 0.12649041727950783, "grad_norm": 0.22265625, "grad_norm_var": 0.0005512078603108724, "learning_rate": 0.0001, "loss": 1.2736, "loss/crossentropy": 2.574791431427002, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.1720665544271469, "step": 8471 }, { "epoch": 0.12650534945012282, "grad_norm": 0.251953125, "grad_norm_var": 0.000534975528717041, "learning_rate": 0.0001, "loss": 1.2678, "loss/crossentropy": 2.589767098426819, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.174020454287529, "step": 8472 }, { "epoch": 0.1265202816207378, "grad_norm": 0.2216796875, "grad_norm_var": 0.0005747318267822265, "learning_rate": 0.0001, "loss": 1.2701, "loss/crossentropy": 2.599360227584839, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.19197092950344086, "step": 8473 }, { "epoch": 0.12653521379135277, "grad_norm": 0.2392578125, "grad_norm_var": 0.0005101164182027181, "learning_rate": 0.0001, "loss": 1.4153, "loss/crossentropy": 2.8007251024246216, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.19260653853416443, "step": 8474 }, { "epoch": 0.12655014596196776, "grad_norm": 0.20703125, "grad_norm_var": 0.0006198724110921224, "learning_rate": 0.0001, "loss": 1.2302, "loss/crossentropy": 2.683679461479187, "loss/fcd": 1.0625, "loss/idx": 10.5, "loss/logits": 0.16773760318756104, "step": 8475 }, { "epoch": 0.12656507813258275, "grad_norm": 0.267578125, "grad_norm_var": 0.0006355126698811849, "learning_rate": 0.0001, "loss": 1.3841, "loss/crossentropy": 2.919873833656311, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.2122495323419571, "step": 8476 }, { "epoch": 0.12658001030319774, "grad_norm": 0.2353515625, "grad_norm_var": 0.0006355126698811849, "learning_rate": 0.0001, "loss": 1.3212, "loss/crossentropy": 2.9065253734588623, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.196236290037632, "step": 8477 }, { "epoch": 0.1265949424738127, "grad_norm": 0.255859375, "grad_norm_var": 0.0005828857421875, "learning_rate": 0.0001, "loss": 1.3664, "loss/crossentropy": 2.783403754234314, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.1906513199210167, "step": 8478 }, { "epoch": 0.1266098746444277, "grad_norm": 0.2412109375, "grad_norm_var": 0.0003819425900777181, "learning_rate": 0.0001, "loss": 1.2827, "loss/crossentropy": 2.5044206380844116, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.17332954704761505, "step": 8479 }, { "epoch": 0.12662480681504268, "grad_norm": 0.234375, "grad_norm_var": 0.0003873189290364583, "learning_rate": 0.0001, "loss": 1.4807, "loss/crossentropy": 2.3542126417160034, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.22290990501642227, "step": 8480 }, { "epoch": 0.12663973898565764, "grad_norm": 0.24609375, "grad_norm_var": 0.00035157203674316404, "learning_rate": 0.0001, "loss": 1.3165, "loss/crossentropy": 2.6811983585357666, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.18368180841207504, "step": 8481 }, { "epoch": 0.12665467115627263, "grad_norm": 0.255859375, "grad_norm_var": 0.0003180821736653646, "learning_rate": 0.0001, "loss": 1.4827, "loss/crossentropy": 2.7119728326797485, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.22486713528633118, "step": 8482 }, { "epoch": 0.12666960332688762, "grad_norm": 0.263671875, "grad_norm_var": 0.00025959014892578124, "learning_rate": 0.0001, "loss": 1.3638, "loss/crossentropy": 2.483261227607727, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.20365940779447556, "step": 8483 }, { "epoch": 0.12668453549750258, "grad_norm": 0.244140625, "grad_norm_var": 0.00025620460510253904, "learning_rate": 0.0001, "loss": 1.336, "loss/crossentropy": 2.6740169525146484, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.19142049551010132, "step": 8484 }, { "epoch": 0.12669946766811757, "grad_norm": 0.22265625, "grad_norm_var": 0.00028100013732910154, "learning_rate": 0.0001, "loss": 1.3448, "loss/crossentropy": 2.432507872581482, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.1846570298075676, "step": 8485 }, { "epoch": 0.12671439983873256, "grad_norm": 0.2001953125, "grad_norm_var": 0.0003751397132873535, "learning_rate": 0.0001, "loss": 1.2365, "loss/crossentropy": 2.6401230096817017, "loss/fcd": 1.06640625, "loss/idx": 10.5, "loss/logits": 0.17010219395160675, "step": 8486 }, { "epoch": 0.12672933200934755, "grad_norm": 0.2373046875, "grad_norm_var": 0.00035839080810546876, "learning_rate": 0.0001, "loss": 1.3139, "loss/crossentropy": 2.579011559486389, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1849878579378128, "step": 8487 }, { "epoch": 0.1267442641799625, "grad_norm": 0.267578125, "grad_norm_var": 0.0004006067911783854, "learning_rate": 0.0001, "loss": 1.3838, "loss/crossentropy": 2.557510256767273, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.2040962129831314, "step": 8488 }, { "epoch": 0.1267591963505775, "grad_norm": 0.2265625, "grad_norm_var": 0.00039017597834269205, "learning_rate": 0.0001, "loss": 1.528, "loss/crossentropy": 2.2583199739456177, "loss/fcd": 1.29296875, "loss/idx": 10.5, "loss/logits": 0.23503204435110092, "step": 8489 }, { "epoch": 0.1267741285211925, "grad_norm": 0.232421875, "grad_norm_var": 0.0003940423329671224, "learning_rate": 0.0001, "loss": 1.3522, "loss/crossentropy": 2.52382755279541, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.19200849533081055, "step": 8490 }, { "epoch": 0.12678906069180745, "grad_norm": 0.27734375, "grad_norm_var": 0.0003951867421468099, "learning_rate": 0.0001, "loss": 1.4477, "loss/crossentropy": 2.453027129173279, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.20940230041742325, "step": 8491 }, { "epoch": 0.12680399286242244, "grad_norm": 0.24609375, "grad_norm_var": 0.00035724639892578127, "learning_rate": 0.0001, "loss": 1.3662, "loss/crossentropy": 2.3412410020828247, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.18651682883501053, "step": 8492 }, { "epoch": 0.12681892503303743, "grad_norm": 0.271484375, "grad_norm_var": 0.00040238300959269206, "learning_rate": 0.0001, "loss": 1.4934, "loss/crossentropy": 2.386872410774231, "loss/fcd": 1.2890625, "loss/idx": 10.5, "loss/logits": 0.20430083572864532, "step": 8493 }, { "epoch": 0.12683385720365242, "grad_norm": 0.240234375, "grad_norm_var": 0.00039538939793904625, "learning_rate": 0.0001, "loss": 1.4333, "loss/crossentropy": 2.1785467863082886, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.2106284201145172, "step": 8494 }, { "epoch": 0.12684878937426738, "grad_norm": 0.2216796875, "grad_norm_var": 0.0004270195960998535, "learning_rate": 0.0001, "loss": 1.2728, "loss/crossentropy": 2.563081979751587, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.18300028145313263, "step": 8495 }, { "epoch": 0.12686372154488237, "grad_norm": 0.2216796875, "grad_norm_var": 0.00045166015625, "learning_rate": 0.0001, "loss": 1.281, "loss/crossentropy": 2.596353054046631, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.1872180700302124, "step": 8496 }, { "epoch": 0.12687865371549736, "grad_norm": 0.25390625, "grad_norm_var": 0.0004595438639322917, "learning_rate": 0.0001, "loss": 1.3227, "loss/crossentropy": 2.5126789808273315, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.1742897853255272, "step": 8497 }, { "epoch": 0.12689358588611233, "grad_norm": 0.291015625, "grad_norm_var": 0.0005985895792643229, "learning_rate": 0.0001, "loss": 1.4953, "loss/crossentropy": 2.420375347137451, "loss/fcd": 1.27734375, "loss/idx": 10.5, "loss/logits": 0.21792086958885193, "step": 8498 }, { "epoch": 0.12690851805672732, "grad_norm": 0.232421875, "grad_norm_var": 0.0005812962849934896, "learning_rate": 0.0001, "loss": 1.3496, "loss/crossentropy": 2.424531579017639, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.18162761628627777, "step": 8499 }, { "epoch": 0.1269234502273423, "grad_norm": 0.29296875, "grad_norm_var": 0.0007382551829020182, "learning_rate": 0.0001, "loss": 1.3988, "loss/crossentropy": 2.421039581298828, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.19180059432983398, "step": 8500 }, { "epoch": 0.12693838239795727, "grad_norm": 0.2177734375, "grad_norm_var": 0.0007549246152242024, "learning_rate": 0.0001, "loss": 1.277, "loss/crossentropy": 2.490375518798828, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.1715405359864235, "step": 8501 }, { "epoch": 0.12695331456857226, "grad_norm": 0.208984375, "grad_norm_var": 0.0007064660390218099, "learning_rate": 0.0001, "loss": 1.2509, "loss/crossentropy": 2.5155304670333862, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.1689060479402542, "step": 8502 }, { "epoch": 0.12696824673918725, "grad_norm": 0.2294921875, "grad_norm_var": 0.0007195631663004557, "learning_rate": 0.0001, "loss": 1.2446, "loss/crossentropy": 2.7117968797683716, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.17033204436302185, "step": 8503 }, { "epoch": 0.12698317890980224, "grad_norm": 0.26171875, "grad_norm_var": 0.0007046381632486979, "learning_rate": 0.0001, "loss": 1.3307, "loss/crossentropy": 2.7102948427200317, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.19791512191295624, "step": 8504 }, { "epoch": 0.1269981110804172, "grad_norm": 0.265625, "grad_norm_var": 0.0007020950317382813, "learning_rate": 0.0001, "loss": 1.4396, "loss/crossentropy": 2.413556456565857, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.2091570869088173, "step": 8505 }, { "epoch": 0.1270130432510322, "grad_norm": 0.232421875, "grad_norm_var": 0.0007020950317382813, "learning_rate": 0.0001, "loss": 1.3231, "loss/crossentropy": 2.7679249048233032, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.1863333359360695, "step": 8506 }, { "epoch": 0.12702797542164718, "grad_norm": 0.265625, "grad_norm_var": 0.000664520263671875, "learning_rate": 0.0001, "loss": 1.4591, "loss/crossentropy": 2.294028401374817, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.2169056013226509, "step": 8507 }, { "epoch": 0.12704290759226214, "grad_norm": 0.23046875, "grad_norm_var": 0.0006818135579427083, "learning_rate": 0.0001, "loss": 1.4198, "loss/crossentropy": 2.5214295387268066, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.22838714718818665, "step": 8508 }, { "epoch": 0.12705783976287713, "grad_norm": 0.232421875, "grad_norm_var": 0.0006449381510416667, "learning_rate": 0.0001, "loss": 1.3594, "loss/crossentropy": 2.7073439359664917, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.19536203145980835, "step": 8509 }, { "epoch": 0.12707277193349212, "grad_norm": 0.390625, "grad_norm_var": 0.0019899845123291016, "learning_rate": 0.0001, "loss": 1.5633, "loss/crossentropy": 2.629412531852722, "loss/fcd": 1.3203125, "loss/idx": 10.5, "loss/logits": 0.242996446788311, "step": 8510 }, { "epoch": 0.12708770410410708, "grad_norm": 0.28125, "grad_norm_var": 0.001962594191233317, "learning_rate": 0.0001, "loss": 1.5271, "loss/crossentropy": 2.8067400455474854, "loss/fcd": 1.296875, "loss/idx": 10.5, "loss/logits": 0.2302684336900711, "step": 8511 }, { "epoch": 0.12710263627472207, "grad_norm": 0.23828125, "grad_norm_var": 0.0019021352132161458, "learning_rate": 0.0001, "loss": 1.3212, "loss/crossentropy": 2.4246126413345337, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.19615702331066132, "step": 8512 }, { "epoch": 0.12711756844533706, "grad_norm": 0.359375, "grad_norm_var": 0.002542432149251302, "learning_rate": 0.0001, "loss": 1.5097, "loss/crossentropy": 2.6489747762680054, "loss/fcd": 1.30078125, "loss/idx": 10.5, "loss/logits": 0.2089589759707451, "step": 8513 }, { "epoch": 0.12713250061595205, "grad_norm": 0.267578125, "grad_norm_var": 0.002493604024251302, "learning_rate": 0.0001, "loss": 1.4698, "loss/crossentropy": 2.4449251890182495, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.21978987753391266, "step": 8514 }, { "epoch": 0.127147432786567, "grad_norm": 0.21484375, "grad_norm_var": 0.002584441502888997, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.661946415901184, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.20812320709228516, "step": 8515 }, { "epoch": 0.127162364957182, "grad_norm": 0.236328125, "grad_norm_var": 0.0025498708089192706, "learning_rate": 0.0001, "loss": 1.4276, "loss/crossentropy": 2.772558093070984, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.20880873501300812, "step": 8516 }, { "epoch": 0.127177297127797, "grad_norm": 0.21875, "grad_norm_var": 0.0025446534156799315, "learning_rate": 0.0001, "loss": 1.2756, "loss/crossentropy": 2.6201006174087524, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.17796514928340912, "step": 8517 }, { "epoch": 0.12719222929841195, "grad_norm": 0.21875, "grad_norm_var": 0.0024863203366597493, "learning_rate": 0.0001, "loss": 1.3432, "loss/crossentropy": 2.3850637674331665, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.19862299412488937, "step": 8518 }, { "epoch": 0.12720716146902694, "grad_norm": 0.2353515625, "grad_norm_var": 0.0024654348691304524, "learning_rate": 0.0001, "loss": 1.2894, "loss/crossentropy": 2.6522282361984253, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1760876178741455, "step": 8519 }, { "epoch": 0.12722209363964193, "grad_norm": 0.251953125, "grad_norm_var": 0.002468295892079671, "learning_rate": 0.0001, "loss": 1.476, "loss/crossentropy": 2.776502251625061, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.24161626398563385, "step": 8520 }, { "epoch": 0.12723702581025692, "grad_norm": 0.2431640625, "grad_norm_var": 0.0024791717529296874, "learning_rate": 0.0001, "loss": 1.3202, "loss/crossentropy": 2.6041345596313477, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.1795816272497177, "step": 8521 }, { "epoch": 0.12725195798087188, "grad_norm": 0.236328125, "grad_norm_var": 0.0024671554565429688, "learning_rate": 0.0001, "loss": 1.4398, "loss/crossentropy": 2.5723323822021484, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.23278231918811798, "step": 8522 }, { "epoch": 0.12726689015148687, "grad_norm": 0.240234375, "grad_norm_var": 0.002480173110961914, "learning_rate": 0.0001, "loss": 1.3779, "loss/crossentropy": 2.7132668495178223, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20212603360414505, "step": 8523 }, { "epoch": 0.12728182232210186, "grad_norm": 0.2578125, "grad_norm_var": 0.002433888117472331, "learning_rate": 0.0001, "loss": 1.5659, "loss/crossentropy": 2.499822735786438, "loss/fcd": 1.3046875, "loss/idx": 10.5, "loss/logits": 0.2611989229917526, "step": 8524 }, { "epoch": 0.12729675449271682, "grad_norm": 0.33203125, "grad_norm_var": 0.0027184168497721354, "learning_rate": 0.0001, "loss": 1.6542, "loss/crossentropy": 2.4352954030036926, "loss/fcd": 1.3828125, "loss/idx": 10.5, "loss/logits": 0.2713760584592819, "step": 8525 }, { "epoch": 0.1273116866633318, "grad_norm": 0.232421875, "grad_norm_var": 0.0016099135080973306, "learning_rate": 0.0001, "loss": 1.3244, "loss/crossentropy": 2.5002193450927734, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.19161398708820343, "step": 8526 }, { "epoch": 0.1273266188339468, "grad_norm": 0.30078125, "grad_norm_var": 0.0017046451568603516, "learning_rate": 0.0001, "loss": 1.5903, "loss/crossentropy": 2.8471643924713135, "loss/fcd": 1.32421875, "loss/idx": 10.5, "loss/logits": 0.2660604938864708, "step": 8527 }, { "epoch": 0.12734155100456176, "grad_norm": 0.2314453125, "grad_norm_var": 0.0017230312029520671, "learning_rate": 0.0001, "loss": 1.2845, "loss/crossentropy": 2.6462355852127075, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.1829759031534195, "step": 8528 }, { "epoch": 0.12735648317517675, "grad_norm": 0.28125, "grad_norm_var": 0.0010154048601786296, "learning_rate": 0.0001, "loss": 1.3666, "loss/crossentropy": 2.8643723726272583, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.19077249616384506, "step": 8529 }, { "epoch": 0.12737141534579174, "grad_norm": 0.224609375, "grad_norm_var": 0.0010297417640686036, "learning_rate": 0.0001, "loss": 1.289, "loss/crossentropy": 2.542539954185486, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.17180554568767548, "step": 8530 }, { "epoch": 0.12738634751640673, "grad_norm": 0.19921875, "grad_norm_var": 0.001112520694732666, "learning_rate": 0.0001, "loss": 1.2253, "loss/crossentropy": 2.5602229833602905, "loss/fcd": 1.05859375, "loss/idx": 10.5, "loss/logits": 0.16665849089622498, "step": 8531 }, { "epoch": 0.1274012796870217, "grad_norm": 0.2451171875, "grad_norm_var": 0.0011056900024414063, "learning_rate": 0.0001, "loss": 1.4477, "loss/crossentropy": 2.4991031885147095, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.2016019970178604, "step": 8532 }, { "epoch": 0.12741621185763669, "grad_norm": 0.236328125, "grad_norm_var": 0.0010591983795166016, "learning_rate": 0.0001, "loss": 1.3367, "loss/crossentropy": 2.7453867197036743, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.1882609874010086, "step": 8533 }, { "epoch": 0.12743114402825167, "grad_norm": 0.25390625, "grad_norm_var": 0.0009996891021728516, "learning_rate": 0.0001, "loss": 1.3841, "loss/crossentropy": 2.4424021244049072, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.2122187614440918, "step": 8534 }, { "epoch": 0.12744607619886664, "grad_norm": 0.2412109375, "grad_norm_var": 0.00099029541015625, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.540386915206909, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.1887686401605606, "step": 8535 }, { "epoch": 0.12746100836948163, "grad_norm": 0.244140625, "grad_norm_var": 0.000992584228515625, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.7949846982955933, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.18270686268806458, "step": 8536 }, { "epoch": 0.12747594054009662, "grad_norm": 0.251953125, "grad_norm_var": 0.0009894013404846192, "learning_rate": 0.0001, "loss": 1.3672, "loss/crossentropy": 2.4033615589141846, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.20312731713056564, "step": 8537 }, { "epoch": 0.1274908727107116, "grad_norm": 0.234375, "grad_norm_var": 0.000993343194325765, "learning_rate": 0.0001, "loss": 1.354, "loss/crossentropy": 2.3805168867111206, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.18602095544338226, "step": 8538 }, { "epoch": 0.12750580488132657, "grad_norm": 0.259765625, "grad_norm_var": 0.0009906411170959473, "learning_rate": 0.0001, "loss": 1.3229, "loss/crossentropy": 2.3457683324813843, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.17056211084127426, "step": 8539 }, { "epoch": 0.12752073705194156, "grad_norm": 0.259765625, "grad_norm_var": 0.0009924848874409995, "learning_rate": 0.0001, "loss": 1.5397, "loss/crossentropy": 2.768036127090454, "loss/fcd": 1.27734375, "loss/idx": 10.5, "loss/logits": 0.2623632848262787, "step": 8540 }, { "epoch": 0.12753566922255655, "grad_norm": 0.2470703125, "grad_norm_var": 0.0005344231923421224, "learning_rate": 0.0001, "loss": 1.3145, "loss/crossentropy": 2.33001446723938, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.18169302493333817, "step": 8541 }, { "epoch": 0.1275506013931715, "grad_norm": 0.2294921875, "grad_norm_var": 0.0005404432614644368, "learning_rate": 0.0001, "loss": 1.3118, "loss/crossentropy": 2.5731959342956543, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.17896100878715515, "step": 8542 }, { "epoch": 0.1275655335637865, "grad_norm": 0.72265625, "grad_norm_var": 0.014729972680409749, "learning_rate": 0.0001, "loss": 1.564, "loss/crossentropy": 2.1224491596221924, "loss/fcd": 1.390625, "loss/idx": 10.5, "loss/logits": 0.17333680391311646, "step": 8543 }, { "epoch": 0.1275804657344015, "grad_norm": 0.236328125, "grad_norm_var": 0.014704640706380208, "learning_rate": 0.0001, "loss": 1.4036, "loss/crossentropy": 2.7835495471954346, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.21606861799955368, "step": 8544 }, { "epoch": 0.12759539790501645, "grad_norm": 0.255859375, "grad_norm_var": 0.014716831843058269, "learning_rate": 0.0001, "loss": 1.4213, "loss/crossentropy": 2.3427305221557617, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.1986471712589264, "step": 8545 }, { "epoch": 0.12761033007563144, "grad_norm": 0.248046875, "grad_norm_var": 0.014605061213175455, "learning_rate": 0.0001, "loss": 1.5219, "loss/crossentropy": 2.326300859451294, "loss/fcd": 1.29296875, "loss/idx": 10.5, "loss/logits": 0.22897008806467056, "step": 8546 }, { "epoch": 0.12762526224624643, "grad_norm": 0.296875, "grad_norm_var": 0.01424266497294108, "learning_rate": 0.0001, "loss": 1.5229, "loss/crossentropy": 2.427298069000244, "loss/fcd": 1.30078125, "loss/idx": 10.5, "loss/logits": 0.2221175581216812, "step": 8547 }, { "epoch": 0.12764019441686142, "grad_norm": 0.236328125, "grad_norm_var": 0.014287118117014568, "learning_rate": 0.0001, "loss": 1.3053, "loss/crossentropy": 2.7114555835723877, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1919962391257286, "step": 8548 }, { "epoch": 0.12765512658747638, "grad_norm": 0.28125, "grad_norm_var": 0.014161360263824464, "learning_rate": 0.0001, "loss": 1.5192, "loss/crossentropy": 2.500778079032898, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.25357890129089355, "step": 8549 }, { "epoch": 0.12767005875809137, "grad_norm": 0.267578125, "grad_norm_var": 0.014123308658599853, "learning_rate": 0.0001, "loss": 1.5288, "loss/crossentropy": 2.635876774787903, "loss/fcd": 1.296875, "loss/idx": 10.5, "loss/logits": 0.23197253048419952, "step": 8550 }, { "epoch": 0.12768499092870636, "grad_norm": 0.2392578125, "grad_norm_var": 0.014134180545806885, "learning_rate": 0.0001, "loss": 1.3683, "loss/crossentropy": 2.3706743717193604, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.20031056553125381, "step": 8551 }, { "epoch": 0.12769992309932132, "grad_norm": 0.224609375, "grad_norm_var": 0.014256409804026286, "learning_rate": 0.0001, "loss": 1.2655, "loss/crossentropy": 2.601456642150879, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.17172672599554062, "step": 8552 }, { "epoch": 0.1277148552699363, "grad_norm": 0.263671875, "grad_norm_var": 0.014220074812571207, "learning_rate": 0.0001, "loss": 1.2748, "loss/crossentropy": 2.6788665056228638, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.19280841946601868, "step": 8553 }, { "epoch": 0.1277297874405513, "grad_norm": 0.216796875, "grad_norm_var": 0.014349679152170816, "learning_rate": 0.0001, "loss": 1.2219, "loss/crossentropy": 2.545336604118347, "loss/fcd": 1.05859375, "loss/idx": 10.5, "loss/logits": 0.16332778334617615, "step": 8554 }, { "epoch": 0.1277447196111663, "grad_norm": 0.22265625, "grad_norm_var": 0.014537521203358968, "learning_rate": 0.0001, "loss": 1.2885, "loss/crossentropy": 2.550508499145508, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.18308014422655106, "step": 8555 }, { "epoch": 0.12775965178178125, "grad_norm": 0.2255859375, "grad_norm_var": 0.014693705240885417, "learning_rate": 0.0001, "loss": 1.3105, "loss/crossentropy": 2.7695679664611816, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.17773247510194778, "step": 8556 }, { "epoch": 0.12777458395239624, "grad_norm": 0.2412109375, "grad_norm_var": 0.014718357721964519, "learning_rate": 0.0001, "loss": 1.4058, "loss/crossentropy": 2.6993560791015625, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.21053100377321243, "step": 8557 }, { "epoch": 0.12778951612301123, "grad_norm": 0.419921875, "grad_norm_var": 0.015816334883371988, "learning_rate": 0.0001, "loss": 1.5942, "loss/crossentropy": 2.6255128383636475, "loss/fcd": 1.31640625, "loss/idx": 10.5, "loss/logits": 0.2777976766228676, "step": 8558 }, { "epoch": 0.1278044482936262, "grad_norm": 0.2431640625, "grad_norm_var": 0.002359882990519206, "learning_rate": 0.0001, "loss": 1.3924, "loss/crossentropy": 2.393702268600464, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.19708547741174698, "step": 8559 }, { "epoch": 0.12781938046424118, "grad_norm": 0.396484375, "grad_norm_var": 0.0035120487213134766, "learning_rate": 0.0001, "loss": 1.8469, "loss/crossentropy": 2.4305427074432373, "loss/fcd": 1.578125, "loss/idx": 10.5, "loss/logits": 0.26882296055555344, "step": 8560 }, { "epoch": 0.12783431263485617, "grad_norm": 0.24609375, "grad_norm_var": 0.0035331090291341144, "learning_rate": 0.0001, "loss": 1.4234, "loss/crossentropy": 2.465060591697693, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.20856792479753494, "step": 8561 }, { "epoch": 0.12784924480547114, "grad_norm": 0.25, "grad_norm_var": 0.003528451919555664, "learning_rate": 0.0001, "loss": 1.2266, "loss/crossentropy": 2.6809805631637573, "loss/fcd": 1.0625, "loss/idx": 10.5, "loss/logits": 0.1641324833035469, "step": 8562 }, { "epoch": 0.12786417697608612, "grad_norm": 0.2392578125, "grad_norm_var": 0.003506179650624593, "learning_rate": 0.0001, "loss": 1.2972, "loss/crossentropy": 2.597447633743286, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.1917405128479004, "step": 8563 }, { "epoch": 0.12787910914670111, "grad_norm": 0.248046875, "grad_norm_var": 0.0034725149472554524, "learning_rate": 0.0001, "loss": 1.3814, "loss/crossentropy": 2.664630889892578, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.22122998535633087, "step": 8564 }, { "epoch": 0.1278940413173161, "grad_norm": 0.255859375, "grad_norm_var": 0.003454744815826416, "learning_rate": 0.0001, "loss": 1.2652, "loss/crossentropy": 2.8670397996902466, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.16751247644424438, "step": 8565 }, { "epoch": 0.12790897348793107, "grad_norm": 0.298828125, "grad_norm_var": 0.0035368879636128742, "learning_rate": 0.0001, "loss": 1.4761, "loss/crossentropy": 2.4826197624206543, "loss/fcd": 1.2734375, "loss/idx": 10.5, "loss/logits": 0.20270900428295135, "step": 8566 }, { "epoch": 0.12792390565854606, "grad_norm": 0.2236328125, "grad_norm_var": 0.0036046624183654783, "learning_rate": 0.0001, "loss": 1.311, "loss/crossentropy": 2.55198335647583, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.18988899886608124, "step": 8567 }, { "epoch": 0.12793883782916105, "grad_norm": 0.2392578125, "grad_norm_var": 0.0035421371459960936, "learning_rate": 0.0001, "loss": 1.3265, "loss/crossentropy": 2.439581036567688, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.1702074632048607, "step": 8568 }, { "epoch": 0.127953769999776, "grad_norm": 0.267578125, "grad_norm_var": 0.0035427093505859377, "learning_rate": 0.0001, "loss": 1.5145, "loss/crossentropy": 2.50342059135437, "loss/fcd": 1.28515625, "loss/idx": 10.5, "loss/logits": 0.22937516123056412, "step": 8569 }, { "epoch": 0.127968702170391, "grad_norm": 0.24609375, "grad_norm_var": 0.0034094333648681642, "learning_rate": 0.0001, "loss": 1.3544, "loss/crossentropy": 2.899514317512512, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.1903155893087387, "step": 8570 }, { "epoch": 0.127983634341006, "grad_norm": 0.25390625, "grad_norm_var": 0.0032878716786702475, "learning_rate": 0.0001, "loss": 1.3584, "loss/crossentropy": 2.7777212858200073, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.1904641091823578, "step": 8571 }, { "epoch": 0.12799856651162095, "grad_norm": 0.224609375, "grad_norm_var": 0.0032935102780659994, "learning_rate": 0.0001, "loss": 1.2887, "loss/crossentropy": 2.563019633293152, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.17928991466760635, "step": 8572 }, { "epoch": 0.12801349868223594, "grad_norm": 0.251953125, "grad_norm_var": 0.003261820475260417, "learning_rate": 0.0001, "loss": 1.3686, "loss/crossentropy": 2.6529359817504883, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.19281208515167236, "step": 8573 }, { "epoch": 0.12802843085285093, "grad_norm": 0.271484375, "grad_norm_var": 0.0016527811686197916, "learning_rate": 0.0001, "loss": 1.3256, "loss/crossentropy": 2.884620785713196, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.18883607536554337, "step": 8574 }, { "epoch": 0.12804336302346592, "grad_norm": 0.2421875, "grad_norm_var": 0.0016550024350484212, "learning_rate": 0.0001, "loss": 1.3125, "loss/crossentropy": 2.69180428981781, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.18361171334981918, "step": 8575 }, { "epoch": 0.12805829519408088, "grad_norm": 0.267578125, "grad_norm_var": 0.0003426512082417806, "learning_rate": 0.0001, "loss": 1.4366, "loss/crossentropy": 2.6226890087127686, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.21397250890731812, "step": 8576 }, { "epoch": 0.12807322736469587, "grad_norm": 0.2470703125, "grad_norm_var": 0.00034198760986328127, "learning_rate": 0.0001, "loss": 1.3431, "loss/crossentropy": 2.712957262992859, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.19852062314748764, "step": 8577 }, { "epoch": 0.12808815953531086, "grad_norm": 0.71484375, "grad_norm_var": 0.013741048177083333, "learning_rate": 0.0001, "loss": 1.5082, "loss/crossentropy": 2.4054635763168335, "loss/fcd": 1.29296875, "loss/idx": 10.5, "loss/logits": 0.21520142257213593, "step": 8578 }, { "epoch": 0.12810309170592582, "grad_norm": 0.2392578125, "grad_norm_var": 0.013741048177083333, "learning_rate": 0.0001, "loss": 1.2746, "loss/crossentropy": 2.6239370107650757, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.16911545395851135, "step": 8579 }, { "epoch": 0.1281180238765408, "grad_norm": 0.259765625, "grad_norm_var": 0.013698514302571614, "learning_rate": 0.0001, "loss": 1.2944, "loss/crossentropy": 2.6457918882369995, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.17716865241527557, "step": 8580 }, { "epoch": 0.1281329560471558, "grad_norm": 0.330078125, "grad_norm_var": 0.013789113362630208, "learning_rate": 0.0001, "loss": 1.566, "loss/crossentropy": 2.562908411026001, "loss/fcd": 1.3046875, "loss/idx": 10.5, "loss/logits": 0.2613350376486778, "step": 8581 }, { "epoch": 0.1281478882177708, "grad_norm": 0.275390625, "grad_norm_var": 0.013783772786458334, "learning_rate": 0.0001, "loss": 1.4472, "loss/crossentropy": 2.6804821491241455, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.2245389148592949, "step": 8582 }, { "epoch": 0.12816282038838575, "grad_norm": 0.275390625, "grad_norm_var": 0.013529996077219645, "learning_rate": 0.0001, "loss": 1.3436, "loss/crossentropy": 2.5744311809539795, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.18730425834655762, "step": 8583 }, { "epoch": 0.12817775255900074, "grad_norm": 0.263671875, "grad_norm_var": 0.013408899307250977, "learning_rate": 0.0001, "loss": 1.432, "loss/crossentropy": 2.5674169063568115, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.21715512871742249, "step": 8584 }, { "epoch": 0.12819268472961573, "grad_norm": 0.23828125, "grad_norm_var": 0.013547897338867188, "learning_rate": 0.0001, "loss": 1.3169, "loss/crossentropy": 2.880169630050659, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.1919473633170128, "step": 8585 }, { "epoch": 0.1282076169002307, "grad_norm": 0.255859375, "grad_norm_var": 0.013499816258748373, "learning_rate": 0.0001, "loss": 1.2904, "loss/crossentropy": 2.772332549095154, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.18495575338602066, "step": 8586 }, { "epoch": 0.12822254907084568, "grad_norm": 0.431640625, "grad_norm_var": 0.014661280314127605, "learning_rate": 0.0001, "loss": 1.5482, "loss/crossentropy": 2.4487528800964355, "loss/fcd": 1.28125, "loss/idx": 10.5, "loss/logits": 0.26691165566444397, "step": 8587 }, { "epoch": 0.12823748124146067, "grad_norm": 0.25, "grad_norm_var": 0.01444865862528483, "learning_rate": 0.0001, "loss": 1.2556, "loss/crossentropy": 2.722493529319763, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.16579513251781464, "step": 8588 }, { "epoch": 0.12825241341207563, "grad_norm": 0.33203125, "grad_norm_var": 0.014326794942220052, "learning_rate": 0.0001, "loss": 1.5656, "loss/crossentropy": 2.5800914764404297, "loss/fcd": 1.30859375, "loss/idx": 10.5, "loss/logits": 0.25704339146614075, "step": 8589 }, { "epoch": 0.12826734558269062, "grad_norm": 0.275390625, "grad_norm_var": 0.01430981953938802, "learning_rate": 0.0001, "loss": 1.3967, "loss/crossentropy": 2.7101305723190308, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.2091928794980049, "step": 8590 }, { "epoch": 0.1282822777533056, "grad_norm": 0.21484375, "grad_norm_var": 0.014589754740397136, "learning_rate": 0.0001, "loss": 1.2904, "loss/crossentropy": 2.718683123588562, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18101013451814651, "step": 8591 }, { "epoch": 0.1282972099239206, "grad_norm": 0.345703125, "grad_norm_var": 0.014587211608886718, "learning_rate": 0.0001, "loss": 1.6908, "loss/crossentropy": 2.3862355947494507, "loss/fcd": 1.41796875, "loss/idx": 10.5, "loss/logits": 0.27279970794916153, "step": 8592 }, { "epoch": 0.12831214209453556, "grad_norm": 0.2216796875, "grad_norm_var": 0.014838266372680663, "learning_rate": 0.0001, "loss": 1.4016, "loss/crossentropy": 2.3342537879943848, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.19850771874189377, "step": 8593 }, { "epoch": 0.12832707426515055, "grad_norm": 0.279296875, "grad_norm_var": 0.003052838643391927, "learning_rate": 0.0001, "loss": 1.4372, "loss/crossentropy": 2.7432188987731934, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.21458837389945984, "step": 8594 }, { "epoch": 0.12834200643576554, "grad_norm": 0.33203125, "grad_norm_var": 0.0030803958574930827, "learning_rate": 0.0001, "loss": 1.661, "loss/crossentropy": 2.5560413599014282, "loss/fcd": 1.359375, "loss/idx": 10.5, "loss/logits": 0.30167364329099655, "step": 8595 }, { "epoch": 0.1283569386063805, "grad_norm": 0.265625, "grad_norm_var": 0.003061799208323161, "learning_rate": 0.0001, "loss": 1.3254, "loss/crossentropy": 2.522671937942505, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.16131233423948288, "step": 8596 }, { "epoch": 0.1283718707769955, "grad_norm": 0.240234375, "grad_norm_var": 0.0030464450518290203, "learning_rate": 0.0001, "loss": 1.2547, "loss/crossentropy": 2.6425825357437134, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.1648997813463211, "step": 8597 }, { "epoch": 0.12838680294761048, "grad_norm": 0.396484375, "grad_norm_var": 0.003871277968088786, "learning_rate": 0.0001, "loss": 1.4857, "loss/crossentropy": 2.5247840881347656, "loss/fcd": 1.296875, "loss/idx": 10.5, "loss/logits": 0.18881207704544067, "step": 8598 }, { "epoch": 0.12840173511822547, "grad_norm": 0.2158203125, "grad_norm_var": 0.004198265075683594, "learning_rate": 0.0001, "loss": 1.3769, "loss/crossentropy": 2.40364408493042, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.1698870062828064, "step": 8599 }, { "epoch": 0.12841666728884044, "grad_norm": 0.298828125, "grad_norm_var": 0.004175949096679688, "learning_rate": 0.0001, "loss": 1.479, "loss/crossentropy": 2.4048609733581543, "loss/fcd": 1.28515625, "loss/idx": 10.5, "loss/logits": 0.1938740611076355, "step": 8600 }, { "epoch": 0.12843159945945543, "grad_norm": 0.26953125, "grad_norm_var": 0.004033533732096354, "learning_rate": 0.0001, "loss": 1.3112, "loss/crossentropy": 2.7332602739334106, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.17839881777763367, "step": 8601 }, { "epoch": 0.12844653163007042, "grad_norm": 0.2294921875, "grad_norm_var": 0.004193715254465739, "learning_rate": 0.0001, "loss": 1.2968, "loss/crossentropy": 2.6635724306106567, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.1913808211684227, "step": 8602 }, { "epoch": 0.12846146380068538, "grad_norm": 0.2314453125, "grad_norm_var": 0.00284881591796875, "learning_rate": 0.0001, "loss": 1.3168, "loss/crossentropy": 2.498944878578186, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.18395984172821045, "step": 8603 }, { "epoch": 0.12847639597130037, "grad_norm": 0.2353515625, "grad_norm_var": 0.0029108643531799315, "learning_rate": 0.0001, "loss": 1.295, "loss/crossentropy": 2.3632861375808716, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.17386674880981445, "step": 8604 }, { "epoch": 0.12849132814191536, "grad_norm": 0.2177734375, "grad_norm_var": 0.0028425216674804687, "learning_rate": 0.0001, "loss": 1.3217, "loss/crossentropy": 2.667191505432129, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1927792802453041, "step": 8605 }, { "epoch": 0.12850626031253032, "grad_norm": 0.2470703125, "grad_norm_var": 0.002860383192698161, "learning_rate": 0.0001, "loss": 1.5754, "loss/crossentropy": 2.502941131591797, "loss/fcd": 1.296875, "loss/idx": 10.5, "loss/logits": 0.27853551506996155, "step": 8606 }, { "epoch": 0.1285211924831453, "grad_norm": 0.232421875, "grad_norm_var": 0.0027619640032450357, "learning_rate": 0.0001, "loss": 1.3056, "loss/crossentropy": 2.5130362510681152, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.17274945974349976, "step": 8607 }, { "epoch": 0.1285361246537603, "grad_norm": 0.419921875, "grad_norm_var": 0.00389324426651001, "learning_rate": 0.0001, "loss": 1.436, "loss/crossentropy": 2.2796595692634583, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.1742628514766693, "step": 8608 }, { "epoch": 0.1285510568243753, "grad_norm": 0.2373046875, "grad_norm_var": 0.003806142012278239, "learning_rate": 0.0001, "loss": 1.3744, "loss/crossentropy": 2.496807336807251, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.20250064879655838, "step": 8609 }, { "epoch": 0.12856598899499025, "grad_norm": 0.2177734375, "grad_norm_var": 0.0039811293284098305, "learning_rate": 0.0001, "loss": 1.3174, "loss/crossentropy": 2.528532028198242, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.18067124485969543, "step": 8610 }, { "epoch": 0.12858092116560524, "grad_norm": 0.279296875, "grad_norm_var": 0.003704325358072917, "learning_rate": 0.0001, "loss": 1.3786, "loss/crossentropy": 2.8547571897506714, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.2027876228094101, "step": 8611 }, { "epoch": 0.12859585333622023, "grad_norm": 0.2890625, "grad_norm_var": 0.0037417093912760417, "learning_rate": 0.0001, "loss": 1.4469, "loss/crossentropy": 2.5613820552825928, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.2242073342204094, "step": 8612 }, { "epoch": 0.1286107855068352, "grad_norm": 0.21484375, "grad_norm_var": 0.00386961301167806, "learning_rate": 0.0001, "loss": 1.2545, "loss/crossentropy": 2.670618772506714, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.17243334650993347, "step": 8613 }, { "epoch": 0.12862571767745018, "grad_norm": 0.2392578125, "grad_norm_var": 0.002648317813873291, "learning_rate": 0.0001, "loss": 1.4492, "loss/crossentropy": 2.37639844417572, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.2265818864107132, "step": 8614 }, { "epoch": 0.12864064984806517, "grad_norm": 0.244140625, "grad_norm_var": 0.0025516351064046225, "learning_rate": 0.0001, "loss": 1.3004, "loss/crossentropy": 2.6559494733810425, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1871398240327835, "step": 8615 }, { "epoch": 0.12865558201868016, "grad_norm": 0.2333984375, "grad_norm_var": 0.002449667453765869, "learning_rate": 0.0001, "loss": 1.3573, "loss/crossentropy": 2.5749815702438354, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.19326254725456238, "step": 8616 }, { "epoch": 0.12867051418929512, "grad_norm": 0.2470703125, "grad_norm_var": 0.0024298350016276043, "learning_rate": 0.0001, "loss": 1.4032, "loss/crossentropy": 2.662237286567688, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.21573122590780258, "step": 8617 }, { "epoch": 0.1286854463599101, "grad_norm": 0.216796875, "grad_norm_var": 0.0024762749671936035, "learning_rate": 0.0001, "loss": 1.3184, "loss/crossentropy": 2.5637110471725464, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.1933935433626175, "step": 8618 }, { "epoch": 0.1287003785305251, "grad_norm": 0.236328125, "grad_norm_var": 0.0024655659993489585, "learning_rate": 0.0001, "loss": 1.2707, "loss/crossentropy": 2.6081289052963257, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.17696669697761536, "step": 8619 }, { "epoch": 0.12871531070114006, "grad_norm": 0.240234375, "grad_norm_var": 0.0024572014808654785, "learning_rate": 0.0001, "loss": 1.4558, "loss/crossentropy": 2.6211273670196533, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.20972123742103577, "step": 8620 }, { "epoch": 0.12873024287175505, "grad_norm": 0.255859375, "grad_norm_var": 0.002380180358886719, "learning_rate": 0.0001, "loss": 1.3915, "loss/crossentropy": 2.6426044702529907, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.2040492743253708, "step": 8621 }, { "epoch": 0.12874517504237004, "grad_norm": 0.35546875, "grad_norm_var": 0.0030263543128967287, "learning_rate": 0.0001, "loss": 1.3701, "loss/crossentropy": 2.538409948348999, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.1786632090806961, "step": 8622 }, { "epoch": 0.128760107212985, "grad_norm": 0.24609375, "grad_norm_var": 0.0029878576596577963, "learning_rate": 0.0001, "loss": 1.3601, "loss/crossentropy": 2.794005036354065, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.19994032382965088, "step": 8623 }, { "epoch": 0.1287750393836, "grad_norm": 0.23046875, "grad_norm_var": 0.0012117346127827962, "learning_rate": 0.0001, "loss": 1.4108, "loss/crossentropy": 2.7600865364074707, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.22325612604618073, "step": 8624 }, { "epoch": 0.12878997155421498, "grad_norm": 0.236328125, "grad_norm_var": 0.0012133121490478516, "learning_rate": 0.0001, "loss": 1.3311, "loss/crossentropy": 2.516594171524048, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.2138865515589714, "step": 8625 }, { "epoch": 0.12880490372482997, "grad_norm": 0.236328125, "grad_norm_var": 0.0011578202247619629, "learning_rate": 0.0001, "loss": 1.524, "loss/crossentropy": 2.453326940536499, "loss/fcd": 1.28515625, "loss/idx": 10.5, "loss/logits": 0.23885934054851532, "step": 8626 }, { "epoch": 0.12881983589544493, "grad_norm": 0.279296875, "grad_norm_var": 0.0011578202247619629, "learning_rate": 0.0001, "loss": 1.5656, "loss/crossentropy": 2.19560170173645, "loss/fcd": 1.33203125, "loss/idx": 10.5, "loss/logits": 0.23356008529663086, "step": 8627 }, { "epoch": 0.12883476806605992, "grad_norm": 0.259765625, "grad_norm_var": 0.001059114933013916, "learning_rate": 0.0001, "loss": 1.4516, "loss/crossentropy": 2.71031653881073, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.20162466913461685, "step": 8628 }, { "epoch": 0.1288497002366749, "grad_norm": 0.326171875, "grad_norm_var": 0.0013381600379943847, "learning_rate": 0.0001, "loss": 1.3201, "loss/crossentropy": 2.74746036529541, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.17556627839803696, "step": 8629 }, { "epoch": 0.12886463240728988, "grad_norm": 0.25390625, "grad_norm_var": 0.0013204574584960937, "learning_rate": 0.0001, "loss": 1.2381, "loss/crossentropy": 2.6552170515060425, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.16385141760110855, "step": 8630 }, { "epoch": 0.12887956457790486, "grad_norm": 0.25390625, "grad_norm_var": 0.0013108412424723307, "learning_rate": 0.0001, "loss": 1.2761, "loss/crossentropy": 2.6819032430648804, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1628326177597046, "step": 8631 }, { "epoch": 0.12889449674851985, "grad_norm": 0.2353515625, "grad_norm_var": 0.0013050079345703126, "learning_rate": 0.0001, "loss": 1.4184, "loss/crossentropy": 2.704673171043396, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.21524587273597717, "step": 8632 }, { "epoch": 0.12890942891913482, "grad_norm": 0.267578125, "grad_norm_var": 0.0013045907020568849, "learning_rate": 0.0001, "loss": 1.5292, "loss/crossentropy": 2.381703019142151, "loss/fcd": 1.30859375, "loss/idx": 10.5, "loss/logits": 0.22057321667671204, "step": 8633 }, { "epoch": 0.1289243610897498, "grad_norm": 0.291015625, "grad_norm_var": 0.0012399633725484213, "learning_rate": 0.0001, "loss": 1.3866, "loss/crossentropy": 2.510714530944824, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.19911061972379684, "step": 8634 }, { "epoch": 0.1289392932603648, "grad_norm": 0.2177734375, "grad_norm_var": 0.0013268629709879557, "learning_rate": 0.0001, "loss": 1.3807, "loss/crossentropy": 2.5557010173797607, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20487818121910095, "step": 8635 }, { "epoch": 0.12895422543097979, "grad_norm": 0.25, "grad_norm_var": 0.0013050079345703126, "learning_rate": 0.0001, "loss": 1.436, "loss/crossentropy": 2.591485619544983, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.20549532771110535, "step": 8636 }, { "epoch": 0.12896915760159475, "grad_norm": 0.21875, "grad_norm_var": 0.0014224847157796223, "learning_rate": 0.0001, "loss": 1.3377, "loss/crossentropy": 2.7627148628234863, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.1932019591331482, "step": 8637 }, { "epoch": 0.12898408977220974, "grad_norm": 0.2177734375, "grad_norm_var": 0.0008526762326558431, "learning_rate": 0.0001, "loss": 1.2384, "loss/crossentropy": 2.6521055698394775, "loss/fcd": 1.0703125, "loss/idx": 10.5, "loss/logits": 0.16813044995069504, "step": 8638 }, { "epoch": 0.12899902194282473, "grad_norm": 0.24609375, "grad_norm_var": 0.0008526762326558431, "learning_rate": 0.0001, "loss": 1.4272, "loss/crossentropy": 2.499207854270935, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.20846417546272278, "step": 8639 }, { "epoch": 0.1290139541134397, "grad_norm": 0.23828125, "grad_norm_var": 0.000834810733795166, "learning_rate": 0.0001, "loss": 1.4328, "loss/crossentropy": 2.5400514602661133, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.1984565407037735, "step": 8640 }, { "epoch": 0.12902888628405468, "grad_norm": 0.23046875, "grad_norm_var": 0.0008490204811096192, "learning_rate": 0.0001, "loss": 1.4257, "loss/crossentropy": 2.630212187767029, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.21865352988243103, "step": 8641 }, { "epoch": 0.12904381845466967, "grad_norm": 0.244140625, "grad_norm_var": 0.0008371313412984212, "learning_rate": 0.0001, "loss": 1.4268, "loss/crossentropy": 2.584364652633667, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.21981758624315262, "step": 8642 }, { "epoch": 0.12905875062528466, "grad_norm": 0.2421875, "grad_norm_var": 0.0007876038551330566, "learning_rate": 0.0001, "loss": 1.2987, "loss/crossentropy": 2.6240739822387695, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1697477325797081, "step": 8643 }, { "epoch": 0.12907368279589962, "grad_norm": 0.224609375, "grad_norm_var": 0.0008170723915100098, "learning_rate": 0.0001, "loss": 1.3618, "loss/crossentropy": 2.6223682165145874, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.18996167927980423, "step": 8644 }, { "epoch": 0.1290886149665146, "grad_norm": 0.2060546875, "grad_norm_var": 0.0004568576812744141, "learning_rate": 0.0001, "loss": 1.2131, "loss/crossentropy": 2.492342710494995, "loss/fcd": 1.0625, "loss/idx": 10.5, "loss/logits": 0.1506277471780777, "step": 8645 }, { "epoch": 0.1291035471371296, "grad_norm": 0.228515625, "grad_norm_var": 0.00044962565104166664, "learning_rate": 0.0001, "loss": 1.4035, "loss/crossentropy": 2.384657382965088, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.2003864124417305, "step": 8646 }, { "epoch": 0.12911847930774456, "grad_norm": 0.2138671875, "grad_norm_var": 0.00046640634536743164, "learning_rate": 0.0001, "loss": 1.341, "loss/crossentropy": 2.4977844953536987, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.19642189145088196, "step": 8647 }, { "epoch": 0.12913341147835955, "grad_norm": 0.2265625, "grad_norm_var": 0.00047173500061035154, "learning_rate": 0.0001, "loss": 1.3062, "loss/crossentropy": 2.5545865297317505, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1772780939936638, "step": 8648 }, { "epoch": 0.12914834364897454, "grad_norm": 0.259765625, "grad_norm_var": 0.0004418532053629557, "learning_rate": 0.0001, "loss": 1.38, "loss/crossentropy": 2.7549134492874146, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20424485206604004, "step": 8649 }, { "epoch": 0.1291632758195895, "grad_norm": 0.259765625, "grad_norm_var": 0.00026841163635253905, "learning_rate": 0.0001, "loss": 1.4892, "loss/crossentropy": 2.5129189491271973, "loss/fcd": 1.2734375, "loss/idx": 10.5, "loss/logits": 0.21575850993394852, "step": 8650 }, { "epoch": 0.1291782079902045, "grad_norm": 0.203125, "grad_norm_var": 0.00031114816665649413, "learning_rate": 0.0001, "loss": 1.2923, "loss/crossentropy": 2.467241048812866, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.18679745495319366, "step": 8651 }, { "epoch": 0.12919314016081948, "grad_norm": 0.2490234375, "grad_norm_var": 0.0003088474273681641, "learning_rate": 0.0001, "loss": 1.3761, "loss/crossentropy": 2.720829725265503, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.19641844928264618, "step": 8652 }, { "epoch": 0.12920807233143447, "grad_norm": 0.201171875, "grad_norm_var": 0.00035877227783203124, "learning_rate": 0.0001, "loss": 1.2831, "loss/crossentropy": 2.535594940185547, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.173684261739254, "step": 8653 }, { "epoch": 0.12922300450204943, "grad_norm": 0.21875, "grad_norm_var": 0.0003571470578511556, "learning_rate": 0.0001, "loss": 1.2988, "loss/crossentropy": 2.6469335556030273, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.18162348121404648, "step": 8654 }, { "epoch": 0.12923793667266442, "grad_norm": 0.25390625, "grad_norm_var": 0.0003769199053446452, "learning_rate": 0.0001, "loss": 1.2514, "loss/crossentropy": 2.4116036891937256, "loss/fcd": 1.091796875, "loss/idx": 10.5, "loss/logits": 0.15961619466543198, "step": 8655 }, { "epoch": 0.1292528688432794, "grad_norm": 0.25, "grad_norm_var": 0.0003964702288309733, "learning_rate": 0.0001, "loss": 1.3379, "loss/crossentropy": 2.6447890996932983, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.1973147690296173, "step": 8656 }, { "epoch": 0.12926780101389437, "grad_norm": 0.2451171875, "grad_norm_var": 0.0004069010416666667, "learning_rate": 0.0001, "loss": 1.249, "loss/crossentropy": 2.5081361532211304, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.17083506286144257, "step": 8657 }, { "epoch": 0.12928273318450936, "grad_norm": 0.271484375, "grad_norm_var": 0.0004945755004882813, "learning_rate": 0.0001, "loss": 1.3723, "loss/crossentropy": 2.372270941734314, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.21209752559661865, "step": 8658 }, { "epoch": 0.12929766535512435, "grad_norm": 0.251953125, "grad_norm_var": 0.0005103905995686848, "learning_rate": 0.0001, "loss": 1.3479, "loss/crossentropy": 2.6777329444885254, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.19552749395370483, "step": 8659 }, { "epoch": 0.12931259752573934, "grad_norm": 0.26953125, "grad_norm_var": 0.0005729039510091146, "learning_rate": 0.0001, "loss": 1.4329, "loss/crossentropy": 2.709704041481018, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.21029283851385117, "step": 8660 }, { "epoch": 0.1293275296963543, "grad_norm": 0.2373046875, "grad_norm_var": 0.0005006790161132812, "learning_rate": 0.0001, "loss": 1.3535, "loss/crossentropy": 2.641829490661621, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.19337564706802368, "step": 8661 }, { "epoch": 0.1293424618669693, "grad_norm": 0.236328125, "grad_norm_var": 0.0004925409952799479, "learning_rate": 0.0001, "loss": 1.4911, "loss/crossentropy": 2.6703540086746216, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.22941230982542038, "step": 8662 }, { "epoch": 0.12935739403758428, "grad_norm": 0.248046875, "grad_norm_var": 0.00044428110122680665, "learning_rate": 0.0001, "loss": 1.4992, "loss/crossentropy": 2.7821476459503174, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.24136755615472794, "step": 8663 }, { "epoch": 0.12937232620819925, "grad_norm": 0.298828125, "grad_norm_var": 0.0006160060564676921, "learning_rate": 0.0001, "loss": 1.5191, "loss/crossentropy": 2.7562201023101807, "loss/fcd": 1.29296875, "loss/idx": 10.5, "loss/logits": 0.2261122614145279, "step": 8664 }, { "epoch": 0.12938725837881423, "grad_norm": 0.2060546875, "grad_norm_var": 0.0007058302561442057, "learning_rate": 0.0001, "loss": 1.2649, "loss/crossentropy": 2.6579785346984863, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.17510061711072922, "step": 8665 }, { "epoch": 0.12940219054942922, "grad_norm": 0.23828125, "grad_norm_var": 0.0006888707478841146, "learning_rate": 0.0001, "loss": 1.4809, "loss/crossentropy": 2.7345610857009888, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.23870404064655304, "step": 8666 }, { "epoch": 0.1294171227200442, "grad_norm": 0.240234375, "grad_norm_var": 0.000580453872680664, "learning_rate": 0.0001, "loss": 1.2853, "loss/crossentropy": 2.6465083360671997, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.1759365200996399, "step": 8667 }, { "epoch": 0.12943205489065918, "grad_norm": 0.251953125, "grad_norm_var": 0.0005826592445373535, "learning_rate": 0.0001, "loss": 1.416, "loss/crossentropy": 2.6852707862854004, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.21288354694843292, "step": 8668 }, { "epoch": 0.12944698706127417, "grad_norm": 0.25390625, "grad_norm_var": 0.000448763370513916, "learning_rate": 0.0001, "loss": 1.4369, "loss/crossentropy": 2.6376311779022217, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.2025573030114174, "step": 8669 }, { "epoch": 0.12946191923188916, "grad_norm": 0.232421875, "grad_norm_var": 0.00040670633316040037, "learning_rate": 0.0001, "loss": 1.3647, "loss/crossentropy": 2.597885847091675, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.19673596322536469, "step": 8670 }, { "epoch": 0.12947685140250412, "grad_norm": 0.2470703125, "grad_norm_var": 0.00040523211161295575, "learning_rate": 0.0001, "loss": 1.2066, "loss/crossentropy": 2.6271719932556152, "loss/fcd": 1.06640625, "loss/idx": 10.5, "loss/logits": 0.14014694839715958, "step": 8671 }, { "epoch": 0.1294917835731191, "grad_norm": 0.2197265625, "grad_norm_var": 0.0004570921262105306, "learning_rate": 0.0001, "loss": 1.4127, "loss/crossentropy": 2.3306113481521606, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.2173743173480034, "step": 8672 }, { "epoch": 0.1295067157437341, "grad_norm": 0.28515625, "grad_norm_var": 0.0005484898885091146, "learning_rate": 0.0001, "loss": 1.2492, "loss/crossentropy": 2.4990739822387695, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.16324874013662338, "step": 8673 }, { "epoch": 0.12952164791434906, "grad_norm": 0.2353515625, "grad_norm_var": 0.0005230545997619629, "learning_rate": 0.0001, "loss": 1.2339, "loss/crossentropy": 2.624032735824585, "loss/fcd": 1.0625, "loss/idx": 10.5, "loss/logits": 0.1714143380522728, "step": 8674 }, { "epoch": 0.12953658008496405, "grad_norm": 0.29296875, "grad_norm_var": 0.0006552338600158691, "learning_rate": 0.0001, "loss": 1.4222, "loss/crossentropy": 2.496997117996216, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.20731692761182785, "step": 8675 }, { "epoch": 0.12955151225557904, "grad_norm": 0.2236328125, "grad_norm_var": 0.0006647586822509765, "learning_rate": 0.0001, "loss": 1.3016, "loss/crossentropy": 2.573498845100403, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.18443748354911804, "step": 8676 }, { "epoch": 0.12956644442619403, "grad_norm": 0.326171875, "grad_norm_var": 0.0010469714800516764, "learning_rate": 0.0001, "loss": 1.5749, "loss/crossentropy": 2.5315096378326416, "loss/fcd": 1.3515625, "loss/idx": 10.5, "loss/logits": 0.22331435978412628, "step": 8677 }, { "epoch": 0.129581376596809, "grad_norm": 0.263671875, "grad_norm_var": 0.001035622755686442, "learning_rate": 0.0001, "loss": 1.1864, "loss/crossentropy": 2.501495599746704, "loss/fcd": 1.0390625, "loss/idx": 10.5, "loss/logits": 0.1472909152507782, "step": 8678 }, { "epoch": 0.12959630876742398, "grad_norm": 0.2294921875, "grad_norm_var": 0.001071786880493164, "learning_rate": 0.0001, "loss": 1.2589, "loss/crossentropy": 2.5256437063217163, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.17683188617229462, "step": 8679 }, { "epoch": 0.12961124093803897, "grad_norm": 0.42578125, "grad_norm_var": 0.002858098347981771, "learning_rate": 0.0001, "loss": 1.6635, "loss/crossentropy": 2.8528785705566406, "loss/fcd": 1.3671875, "loss/idx": 10.5, "loss/logits": 0.2962986007332802, "step": 8680 }, { "epoch": 0.12962617310865393, "grad_norm": 0.25390625, "grad_norm_var": 0.002652291456858317, "learning_rate": 0.0001, "loss": 1.3674, "loss/crossentropy": 2.580819010734558, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.18769428133964539, "step": 8681 }, { "epoch": 0.12964110527926892, "grad_norm": 0.251953125, "grad_norm_var": 0.0026175777117411294, "learning_rate": 0.0001, "loss": 1.2891, "loss/crossentropy": 2.6406067609786987, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.1797114908695221, "step": 8682 }, { "epoch": 0.1296560374498839, "grad_norm": 0.28125, "grad_norm_var": 0.002589539686838786, "learning_rate": 0.0001, "loss": 1.5432, "loss/crossentropy": 2.494299530982971, "loss/fcd": 1.30859375, "loss/idx": 10.5, "loss/logits": 0.234577938914299, "step": 8683 }, { "epoch": 0.12967096962049887, "grad_norm": 0.306640625, "grad_norm_var": 0.0026656428972880045, "learning_rate": 0.0001, "loss": 1.6393, "loss/crossentropy": 2.253171682357788, "loss/fcd": 1.3984375, "loss/idx": 10.5, "loss/logits": 0.24086108803749084, "step": 8684 }, { "epoch": 0.12968590179111386, "grad_norm": 0.2421875, "grad_norm_var": 0.0027002612749735515, "learning_rate": 0.0001, "loss": 1.3721, "loss/crossentropy": 2.6820595264434814, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.2001963034272194, "step": 8685 }, { "epoch": 0.12970083396172885, "grad_norm": 0.263671875, "grad_norm_var": 0.00260540246963501, "learning_rate": 0.0001, "loss": 1.3006, "loss/crossentropy": 2.619603991508484, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.17557626962661743, "step": 8686 }, { "epoch": 0.12971576613234384, "grad_norm": 0.23828125, "grad_norm_var": 0.0026391983032226563, "learning_rate": 0.0001, "loss": 1.3672, "loss/crossentropy": 2.5668468475341797, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.1953555792570114, "step": 8687 }, { "epoch": 0.1297306983029588, "grad_norm": 0.2470703125, "grad_norm_var": 0.0024981180826822917, "learning_rate": 0.0001, "loss": 1.3334, "loss/crossentropy": 2.737745523452759, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.18493062257766724, "step": 8688 }, { "epoch": 0.1297456304735738, "grad_norm": 0.2177734375, "grad_norm_var": 0.0026722232500712077, "learning_rate": 0.0001, "loss": 1.293, "loss/crossentropy": 2.5898303985595703, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.17577458173036575, "step": 8689 }, { "epoch": 0.12976056264418878, "grad_norm": 0.32421875, "grad_norm_var": 0.002770217259724935, "learning_rate": 0.0001, "loss": 1.4875, "loss/crossentropy": 2.787669062614441, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.22192203253507614, "step": 8690 }, { "epoch": 0.12977549481480374, "grad_norm": 0.2392578125, "grad_norm_var": 0.002816768487294515, "learning_rate": 0.0001, "loss": 1.3467, "loss/crossentropy": 2.760240077972412, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.19824153929948807, "step": 8691 }, { "epoch": 0.12979042698541873, "grad_norm": 0.2578125, "grad_norm_var": 0.002674214045206706, "learning_rate": 0.0001, "loss": 1.3982, "loss/crossentropy": 2.7089054584503174, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.2067609280347824, "step": 8692 }, { "epoch": 0.12980535915603372, "grad_norm": 0.33203125, "grad_norm_var": 0.0027178446451822918, "learning_rate": 0.0001, "loss": 1.6902, "loss/crossentropy": 2.3013370037078857, "loss/fcd": 1.4453125, "loss/idx": 10.5, "loss/logits": 0.24492770433425903, "step": 8693 }, { "epoch": 0.12982029132664868, "grad_norm": 0.2314453125, "grad_norm_var": 0.002824715773264567, "learning_rate": 0.0001, "loss": 1.4957, "loss/crossentropy": 2.316255211830139, "loss/fcd": 1.27734375, "loss/idx": 10.5, "loss/logits": 0.21839133650064468, "step": 8694 }, { "epoch": 0.12983522349726367, "grad_norm": 0.2392578125, "grad_norm_var": 0.0027760783831278484, "learning_rate": 0.0001, "loss": 1.3954, "loss/crossentropy": 2.6461286544799805, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.1883261725306511, "step": 8695 }, { "epoch": 0.12985015566787866, "grad_norm": 0.224609375, "grad_norm_var": 0.001181503136952718, "learning_rate": 0.0001, "loss": 1.3534, "loss/crossentropy": 2.495971441268921, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.18929702043533325, "step": 8696 }, { "epoch": 0.12986508783849365, "grad_norm": 0.26953125, "grad_norm_var": 0.0011851906776428223, "learning_rate": 0.0001, "loss": 1.3197, "loss/crossentropy": 2.461315393447876, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.17130638659000397, "step": 8697 }, { "epoch": 0.12988002000910862, "grad_norm": 0.2177734375, "grad_norm_var": 0.001296869913736979, "learning_rate": 0.0001, "loss": 1.2661, "loss/crossentropy": 2.355332851409912, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.16841971129179, "step": 8698 }, { "epoch": 0.1298949521797236, "grad_norm": 0.2294921875, "grad_norm_var": 0.0013059258460998535, "learning_rate": 0.0001, "loss": 1.3705, "loss/crossentropy": 2.5094900131225586, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.19476008415222168, "step": 8699 }, { "epoch": 0.1299098843503386, "grad_norm": 0.271484375, "grad_norm_var": 0.0011414170265197753, "learning_rate": 0.0001, "loss": 1.4207, "loss/crossentropy": 2.9092459678649902, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.2058674395084381, "step": 8700 }, { "epoch": 0.12992481652095356, "grad_norm": 0.22265625, "grad_norm_var": 0.001193074385325114, "learning_rate": 0.0001, "loss": 1.3228, "loss/crossentropy": 2.7011786699295044, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.18998046219348907, "step": 8701 }, { "epoch": 0.12993974869156855, "grad_norm": 0.22265625, "grad_norm_var": 0.001232461134592692, "learning_rate": 0.0001, "loss": 1.2831, "loss/crossentropy": 2.4889203310012817, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.17373355478048325, "step": 8702 }, { "epoch": 0.12995468086218354, "grad_norm": 0.27734375, "grad_norm_var": 0.0012715617815653483, "learning_rate": 0.0001, "loss": 1.446, "loss/crossentropy": 2.388178825378418, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.20377376675605774, "step": 8703 }, { "epoch": 0.12996961303279853, "grad_norm": 0.265625, "grad_norm_var": 0.001282056172688802, "learning_rate": 0.0001, "loss": 1.4338, "loss/crossentropy": 2.648151993751526, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.20335753262043, "step": 8704 }, { "epoch": 0.1299845452034135, "grad_norm": 0.2333984375, "grad_norm_var": 0.0012245814005533854, "learning_rate": 0.0001, "loss": 1.3424, "loss/crossentropy": 2.7530845403671265, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.20572806149721146, "step": 8705 }, { "epoch": 0.12999947737402848, "grad_norm": 0.224609375, "grad_norm_var": 0.0009076277414957683, "learning_rate": 0.0001, "loss": 1.3861, "loss/crossentropy": 2.644976019859314, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.2024790495634079, "step": 8706 }, { "epoch": 0.13001440954464347, "grad_norm": 0.21875, "grad_norm_var": 0.000956277052561442, "learning_rate": 0.0001, "loss": 1.2873, "loss/crossentropy": 2.4630848169326782, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.18178289383649826, "step": 8707 }, { "epoch": 0.13002934171525843, "grad_norm": 0.3046875, "grad_norm_var": 0.0011664668718973795, "learning_rate": 0.0001, "loss": 1.5108, "loss/crossentropy": 2.640594482421875, "loss/fcd": 1.29296875, "loss/idx": 10.5, "loss/logits": 0.21783722937107086, "step": 8708 }, { "epoch": 0.13004427388587342, "grad_norm": 0.2255859375, "grad_norm_var": 0.0006973902384440104, "learning_rate": 0.0001, "loss": 1.4123, "loss/crossentropy": 2.425838828086853, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.20914813876152039, "step": 8709 }, { "epoch": 0.1300592060564884, "grad_norm": 0.25, "grad_norm_var": 0.0006917277971903483, "learning_rate": 0.0001, "loss": 1.4626, "loss/crossentropy": 2.5503947734832764, "loss/fcd": 1.25390625, "loss/idx": 10.5, "loss/logits": 0.20864953100681305, "step": 8710 }, { "epoch": 0.13007413822710337, "grad_norm": 0.2197265625, "grad_norm_var": 0.0007268548011779785, "learning_rate": 0.0001, "loss": 1.3561, "loss/crossentropy": 2.456355094909668, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.19597165286540985, "step": 8711 }, { "epoch": 0.13008907039771836, "grad_norm": 0.2275390625, "grad_norm_var": 0.0007204532623291016, "learning_rate": 0.0001, "loss": 1.3744, "loss/crossentropy": 2.4479026794433594, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.19467847049236298, "step": 8712 }, { "epoch": 0.13010400256833335, "grad_norm": 0.251953125, "grad_norm_var": 0.0006765365600585937, "learning_rate": 0.0001, "loss": 1.5591, "loss/crossentropy": 2.6623709201812744, "loss/fcd": 1.3046875, "loss/idx": 10.5, "loss/logits": 0.2544146925210953, "step": 8713 }, { "epoch": 0.13011893473894834, "grad_norm": 0.251953125, "grad_norm_var": 0.00064162810643514, "learning_rate": 0.0001, "loss": 1.3779, "loss/crossentropy": 2.665561318397522, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.20602738857269287, "step": 8714 }, { "epoch": 0.1301338669095633, "grad_norm": 0.2158203125, "grad_norm_var": 0.0006790121396382649, "learning_rate": 0.0001, "loss": 1.2481, "loss/crossentropy": 2.598279595375061, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.17001591622829437, "step": 8715 }, { "epoch": 0.1301487990801783, "grad_norm": 0.216796875, "grad_norm_var": 0.0006563146909077962, "learning_rate": 0.0001, "loss": 1.299, "loss/crossentropy": 2.51216983795166, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.18567921221256256, "step": 8716 }, { "epoch": 0.13016373125079328, "grad_norm": 0.2578125, "grad_norm_var": 0.0006554563840230306, "learning_rate": 0.0001, "loss": 1.3601, "loss/crossentropy": 2.4902095794677734, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.1843111291527748, "step": 8717 }, { "epoch": 0.13017866342140824, "grad_norm": 0.2138671875, "grad_norm_var": 0.0006823857625325521, "learning_rate": 0.0001, "loss": 1.254, "loss/crossentropy": 2.622711420059204, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.16416727751493454, "step": 8718 }, { "epoch": 0.13019359559202323, "grad_norm": 0.23828125, "grad_norm_var": 0.0005882898966471354, "learning_rate": 0.0001, "loss": 1.3517, "loss/crossentropy": 2.718404531478882, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.2032979130744934, "step": 8719 }, { "epoch": 0.13020852776263822, "grad_norm": 0.21484375, "grad_norm_var": 0.0005659739176432292, "learning_rate": 0.0001, "loss": 1.1804, "loss/crossentropy": 2.5632985830307007, "loss/fcd": 1.02734375, "loss/idx": 10.5, "loss/logits": 0.15308763831853867, "step": 8720 }, { "epoch": 0.1302234599332532, "grad_norm": 0.228515625, "grad_norm_var": 0.0005687355995178222, "learning_rate": 0.0001, "loss": 1.3946, "loss/crossentropy": 2.678847908973694, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.2110009342432022, "step": 8721 }, { "epoch": 0.13023839210386817, "grad_norm": 0.53515625, "grad_norm_var": 0.006164038181304931, "learning_rate": 0.0001, "loss": 1.6489, "loss/crossentropy": 2.4972656965255737, "loss/fcd": 1.32421875, "loss/idx": 10.5, "loss/logits": 0.3246881663799286, "step": 8722 }, { "epoch": 0.13025332427448316, "grad_norm": 0.2490234375, "grad_norm_var": 0.006077194213867187, "learning_rate": 0.0001, "loss": 1.4575, "loss/crossentropy": 2.624786376953125, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.23094399273395538, "step": 8723 }, { "epoch": 0.13026825644509815, "grad_norm": 0.26171875, "grad_norm_var": 0.005915641784667969, "learning_rate": 0.0001, "loss": 1.418, "loss/crossentropy": 2.3873326778411865, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.20706447213888168, "step": 8724 }, { "epoch": 0.1302831886157131, "grad_norm": 0.259765625, "grad_norm_var": 0.0058607061703999834, "learning_rate": 0.0001, "loss": 1.3288, "loss/crossentropy": 2.678820252418518, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.18818767368793488, "step": 8725 }, { "epoch": 0.1302981207863281, "grad_norm": 0.2265625, "grad_norm_var": 0.005913158257802327, "learning_rate": 0.0001, "loss": 1.3127, "loss/crossentropy": 2.497692823410034, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.1760144755244255, "step": 8726 }, { "epoch": 0.1303130529569431, "grad_norm": 0.248046875, "grad_norm_var": 0.005832608540852865, "learning_rate": 0.0001, "loss": 1.3656, "loss/crossentropy": 2.4007768630981445, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.185874342918396, "step": 8727 }, { "epoch": 0.13032798512755805, "grad_norm": 0.2451171875, "grad_norm_var": 0.005784972508748373, "learning_rate": 0.0001, "loss": 1.4425, "loss/crossentropy": 2.500923752784729, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.22761808335781097, "step": 8728 }, { "epoch": 0.13034291729817304, "grad_norm": 0.2265625, "grad_norm_var": 0.005843035380045573, "learning_rate": 0.0001, "loss": 1.3723, "loss/crossentropy": 2.350411057472229, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.1887211799621582, "step": 8729 }, { "epoch": 0.13035784946878803, "grad_norm": 0.271484375, "grad_norm_var": 0.005857340494791667, "learning_rate": 0.0001, "loss": 1.3397, "loss/crossentropy": 2.5175766944885254, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.18349646031856537, "step": 8730 }, { "epoch": 0.13037278163940302, "grad_norm": 0.2470703125, "grad_norm_var": 0.005747477213541667, "learning_rate": 0.0001, "loss": 1.2442, "loss/crossentropy": 2.6820108890533447, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.16213186085224152, "step": 8731 }, { "epoch": 0.13038771381001799, "grad_norm": 0.2216796875, "grad_norm_var": 0.005721628665924072, "learning_rate": 0.0001, "loss": 1.3758, "loss/crossentropy": 2.6661161184310913, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.20785170048475266, "step": 8732 }, { "epoch": 0.13040264598063298, "grad_norm": 0.2353515625, "grad_norm_var": 0.005756998062133789, "learning_rate": 0.0001, "loss": 1.2652, "loss/crossentropy": 2.7345504760742188, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.17144028842449188, "step": 8733 }, { "epoch": 0.13041757815124796, "grad_norm": 0.251953125, "grad_norm_var": 0.00562511682510376, "learning_rate": 0.0001, "loss": 1.2405, "loss/crossentropy": 2.580200672149658, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.1662888303399086, "step": 8734 }, { "epoch": 0.13043251032186293, "grad_norm": 0.24609375, "grad_norm_var": 0.005606234073638916, "learning_rate": 0.0001, "loss": 1.3152, "loss/crossentropy": 2.6433515548706055, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.1784341260790825, "step": 8735 }, { "epoch": 0.13044744249247792, "grad_norm": 0.318359375, "grad_norm_var": 0.00564498504002889, "learning_rate": 0.0001, "loss": 1.6454, "loss/crossentropy": 2.2920573949813843, "loss/fcd": 1.42578125, "loss/idx": 10.5, "loss/logits": 0.21964259445667267, "step": 8736 }, { "epoch": 0.1304623746630929, "grad_norm": 0.291015625, "grad_norm_var": 0.005568182468414307, "learning_rate": 0.0001, "loss": 1.2988, "loss/crossentropy": 2.5697673559188843, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.1776735559105873, "step": 8737 }, { "epoch": 0.1304773068337079, "grad_norm": 0.2314453125, "grad_norm_var": 0.0006336212158203125, "learning_rate": 0.0001, "loss": 1.2883, "loss/crossentropy": 2.7161539793014526, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.19459276646375656, "step": 8738 }, { "epoch": 0.13049223900432286, "grad_norm": 0.267578125, "grad_norm_var": 0.000647890567779541, "learning_rate": 0.0001, "loss": 1.4452, "loss/crossentropy": 2.6362061500549316, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.20690853893756866, "step": 8739 }, { "epoch": 0.13050717117493785, "grad_norm": 0.2890625, "grad_norm_var": 0.0007259964942932129, "learning_rate": 0.0001, "loss": 1.325, "loss/crossentropy": 2.35215425491333, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.1687571257352829, "step": 8740 }, { "epoch": 0.13052210334555284, "grad_norm": 0.25, "grad_norm_var": 0.0007255196571350098, "learning_rate": 0.0001, "loss": 1.24, "loss/crossentropy": 2.6803377866744995, "loss/fcd": 1.0625, "loss/idx": 10.5, "loss/logits": 0.17748712748289108, "step": 8741 }, { "epoch": 0.1305370355161678, "grad_norm": 0.2392578125, "grad_norm_var": 0.0006887912750244141, "learning_rate": 0.0001, "loss": 1.4468, "loss/crossentropy": 2.669974684715271, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.20461004972457886, "step": 8742 }, { "epoch": 0.1305519676867828, "grad_norm": 0.2255859375, "grad_norm_var": 0.0007411599159240723, "learning_rate": 0.0001, "loss": 1.4191, "loss/crossentropy": 2.2310362458229065, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.212079755961895, "step": 8743 }, { "epoch": 0.13056689985739778, "grad_norm": 0.234375, "grad_norm_var": 0.0007605234781901042, "learning_rate": 0.0001, "loss": 1.3635, "loss/crossentropy": 2.475907564163208, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.18772270530462265, "step": 8744 }, { "epoch": 0.13058183202801274, "grad_norm": 0.2353515625, "grad_norm_var": 0.0007344524065653484, "learning_rate": 0.0001, "loss": 1.4476, "loss/crossentropy": 2.535842776298523, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.21715161204338074, "step": 8745 }, { "epoch": 0.13059676419862773, "grad_norm": 0.2177734375, "grad_norm_var": 0.0007858117421468099, "learning_rate": 0.0001, "loss": 1.2951, "loss/crossentropy": 2.6074116230010986, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.1935613676905632, "step": 8746 }, { "epoch": 0.13061169636924272, "grad_norm": 0.21484375, "grad_norm_var": 0.0008638342221577962, "learning_rate": 0.0001, "loss": 1.3395, "loss/crossentropy": 2.460416793823242, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.2028018832206726, "step": 8747 }, { "epoch": 0.1306266285398577, "grad_norm": 0.28125, "grad_norm_var": 0.0008757114410400391, "learning_rate": 0.0001, "loss": 1.4033, "loss/crossentropy": 2.7222551107406616, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.22359323501586914, "step": 8748 }, { "epoch": 0.13064156071047267, "grad_norm": 0.23046875, "grad_norm_var": 0.0008879303932189941, "learning_rate": 0.0001, "loss": 1.3772, "loss/crossentropy": 2.3121402859687805, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.19748477637767792, "step": 8749 }, { "epoch": 0.13065649288108766, "grad_norm": 0.2421875, "grad_norm_var": 0.0008933345476786295, "learning_rate": 0.0001, "loss": 1.4511, "loss/crossentropy": 2.6985892057418823, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.22456210106611252, "step": 8750 }, { "epoch": 0.13067142505170265, "grad_norm": 0.216796875, "grad_norm_var": 0.0009658137957255045, "learning_rate": 0.0001, "loss": 1.4971, "loss/crossentropy": 2.335131883621216, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.2393207773566246, "step": 8751 }, { "epoch": 0.1306863572223176, "grad_norm": 0.2890625, "grad_norm_var": 0.0007488528887430826, "learning_rate": 0.0001, "loss": 1.362, "loss/crossentropy": 2.555445909500122, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.20180723071098328, "step": 8752 }, { "epoch": 0.1307012893929326, "grad_norm": 0.240234375, "grad_norm_var": 0.0006137172381083171, "learning_rate": 0.0001, "loss": 1.4775, "loss/crossentropy": 2.3973065614700317, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.23138535022735596, "step": 8753 }, { "epoch": 0.1307162215635476, "grad_norm": 0.2314453125, "grad_norm_var": 0.0006137172381083171, "learning_rate": 0.0001, "loss": 1.2474, "loss/crossentropy": 2.5932332277297974, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.1731673702597618, "step": 8754 }, { "epoch": 0.13073115373416255, "grad_norm": 0.216796875, "grad_norm_var": 0.0006157835324605306, "learning_rate": 0.0001, "loss": 1.1986, "loss/crossentropy": 2.7848600149154663, "loss/fcd": 1.046875, "loss/idx": 10.5, "loss/logits": 0.15173570811748505, "step": 8755 }, { "epoch": 0.13074608590477754, "grad_norm": 0.2734375, "grad_norm_var": 0.000530715783437093, "learning_rate": 0.0001, "loss": 1.5936, "loss/crossentropy": 2.605392575263977, "loss/fcd": 1.3515625, "loss/idx": 10.5, "loss/logits": 0.24198997020721436, "step": 8756 }, { "epoch": 0.13076101807539253, "grad_norm": 0.234375, "grad_norm_var": 0.0005249937375386555, "learning_rate": 0.0001, "loss": 1.451, "loss/crossentropy": 2.5612789392471313, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.21658140420913696, "step": 8757 }, { "epoch": 0.13077595024600752, "grad_norm": 0.30078125, "grad_norm_var": 0.0007640679677327473, "learning_rate": 0.0001, "loss": 1.4177, "loss/crossentropy": 2.5318492650985718, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.18727364391088486, "step": 8758 }, { "epoch": 0.13079088241662248, "grad_norm": 0.318359375, "grad_norm_var": 0.0010890920956929525, "learning_rate": 0.0001, "loss": 1.4385, "loss/crossentropy": 2.613611936569214, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.21197688579559326, "step": 8759 }, { "epoch": 0.13080581458723747, "grad_norm": 0.30078125, "grad_norm_var": 0.0012387871742248536, "learning_rate": 0.0001, "loss": 1.4717, "loss/crossentropy": 2.836312770843506, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.23343349248170853, "step": 8760 }, { "epoch": 0.13082074675785246, "grad_norm": 0.2373046875, "grad_norm_var": 0.0012344956398010255, "learning_rate": 0.0001, "loss": 1.2194, "loss/crossentropy": 2.5874491930007935, "loss/fcd": 1.05078125, "loss/idx": 10.5, "loss/logits": 0.16857503354549408, "step": 8761 }, { "epoch": 0.13083567892846742, "grad_norm": 0.263671875, "grad_norm_var": 0.0011513868967692057, "learning_rate": 0.0001, "loss": 1.3783, "loss/crossentropy": 2.63097608089447, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20251117646694183, "step": 8762 }, { "epoch": 0.13085061109908241, "grad_norm": 0.26171875, "grad_norm_var": 0.0010331312815348307, "learning_rate": 0.0001, "loss": 1.3429, "loss/crossentropy": 2.5983814001083374, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.19056444615125656, "step": 8763 }, { "epoch": 0.1308655432696974, "grad_norm": 0.2890625, "grad_norm_var": 0.00106046994527181, "learning_rate": 0.0001, "loss": 1.5202, "loss/crossentropy": 2.974693775177002, "loss/fcd": 1.3046875, "loss/idx": 10.5, "loss/logits": 0.2154785543680191, "step": 8764 }, { "epoch": 0.1308804754403124, "grad_norm": 0.2099609375, "grad_norm_var": 0.0011651953061421712, "learning_rate": 0.0001, "loss": 1.2165, "loss/crossentropy": 2.657555341720581, "loss/fcd": 1.05859375, "loss/idx": 10.5, "loss/logits": 0.15786537528038025, "step": 8765 }, { "epoch": 0.13089540761092736, "grad_norm": 0.33984375, "grad_norm_var": 0.0015569965044657389, "learning_rate": 0.0001, "loss": 1.3111, "loss/crossentropy": 2.6897501945495605, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.17433889210224152, "step": 8766 }, { "epoch": 0.13091033978154235, "grad_norm": 0.2265625, "grad_norm_var": 0.0015015244483947755, "learning_rate": 0.0001, "loss": 1.2617, "loss/crossentropy": 2.4808074235916138, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.17186101526021957, "step": 8767 }, { "epoch": 0.13092527195215733, "grad_norm": 0.28125, "grad_norm_var": 0.0014798442522684734, "learning_rate": 0.0001, "loss": 1.3261, "loss/crossentropy": 2.62634539604187, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.17370712757110596, "step": 8768 }, { "epoch": 0.1309402041227723, "grad_norm": 0.251953125, "grad_norm_var": 0.0014511386553446451, "learning_rate": 0.0001, "loss": 1.2495, "loss/crossentropy": 2.887893319129944, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.16357643902301788, "step": 8769 }, { "epoch": 0.1309551362933873, "grad_norm": 0.203125, "grad_norm_var": 0.001627333958943685, "learning_rate": 0.0001, "loss": 1.2596, "loss/crossentropy": 2.620288372039795, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.17752590775489807, "step": 8770 }, { "epoch": 0.13097006846400228, "grad_norm": 0.275390625, "grad_norm_var": 0.0014804681142171225, "learning_rate": 0.0001, "loss": 1.5513, "loss/crossentropy": 2.5079725980758667, "loss/fcd": 1.3125, "loss/idx": 10.5, "loss/logits": 0.23875436931848526, "step": 8771 }, { "epoch": 0.13098500063461724, "grad_norm": 0.208984375, "grad_norm_var": 0.001682408650716146, "learning_rate": 0.0001, "loss": 1.2021, "loss/crossentropy": 2.641211152076721, "loss/fcd": 1.0390625, "loss/idx": 10.5, "loss/logits": 0.16308272629976273, "step": 8772 }, { "epoch": 0.13099993280523223, "grad_norm": 0.2099609375, "grad_norm_var": 0.001811850070953369, "learning_rate": 0.0001, "loss": 1.2122, "loss/crossentropy": 2.4023520946502686, "loss/fcd": 1.05078125, "loss/idx": 10.5, "loss/logits": 0.16137825697660446, "step": 8773 }, { "epoch": 0.13101486497584722, "grad_norm": 0.228515625, "grad_norm_var": 0.0017565687497456868, "learning_rate": 0.0001, "loss": 1.191, "loss/crossentropy": 2.704079270362854, "loss/fcd": 1.03515625, "loss/idx": 10.5, "loss/logits": 0.1558886468410492, "step": 8774 }, { "epoch": 0.1310297971464622, "grad_norm": 0.232421875, "grad_norm_var": 0.0015110929807027182, "learning_rate": 0.0001, "loss": 1.4569, "loss/crossentropy": 2.4026641845703125, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.22639213502407074, "step": 8775 }, { "epoch": 0.13104472931707717, "grad_norm": 0.24609375, "grad_norm_var": 0.0013370792071024577, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.535096526145935, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.21960995346307755, "step": 8776 }, { "epoch": 0.13105966148769216, "grad_norm": 0.251953125, "grad_norm_var": 0.0013298670450846354, "learning_rate": 0.0001, "loss": 1.3322, "loss/crossentropy": 2.661816358566284, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.20330870151519775, "step": 8777 }, { "epoch": 0.13107459365830715, "grad_norm": 0.2890625, "grad_norm_var": 0.0014205773671468098, "learning_rate": 0.0001, "loss": 1.3263, "loss/crossentropy": 2.6512582302093506, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.18175207823514938, "step": 8778 }, { "epoch": 0.1310895258289221, "grad_norm": 0.263671875, "grad_norm_var": 0.0014237721761067709, "learning_rate": 0.0001, "loss": 1.504, "loss/crossentropy": 2.5149089097976685, "loss/fcd": 1.28125, "loss/idx": 10.5, "loss/logits": 0.2227446809411049, "step": 8779 }, { "epoch": 0.1311044579995371, "grad_norm": 0.216796875, "grad_norm_var": 0.001378488540649414, "learning_rate": 0.0001, "loss": 1.2866, "loss/crossentropy": 2.6071537733078003, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.18111396580934525, "step": 8780 }, { "epoch": 0.1311193901701521, "grad_norm": 0.271484375, "grad_norm_var": 0.0013196587562561036, "learning_rate": 0.0001, "loss": 1.3934, "loss/crossentropy": 2.61593234539032, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.213753342628479, "step": 8781 }, { "epoch": 0.13113432234076708, "grad_norm": 0.25, "grad_norm_var": 0.0007457057634989421, "learning_rate": 0.0001, "loss": 1.5221, "loss/crossentropy": 2.5947054624557495, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.2565002143383026, "step": 8782 }, { "epoch": 0.13114925451138204, "grad_norm": 0.2578125, "grad_norm_var": 0.0007332444190979003, "learning_rate": 0.0001, "loss": 1.3628, "loss/crossentropy": 2.5042314529418945, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19089290499687195, "step": 8783 }, { "epoch": 0.13116418668199703, "grad_norm": 0.248046875, "grad_norm_var": 0.0006467779477437337, "learning_rate": 0.0001, "loss": 1.4778, "loss/crossentropy": 2.7153987884521484, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.2317527011036873, "step": 8784 }, { "epoch": 0.13117911885261202, "grad_norm": 0.251953125, "grad_norm_var": 0.0006467779477437337, "learning_rate": 0.0001, "loss": 1.4043, "loss/crossentropy": 2.7072259187698364, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.2168460339307785, "step": 8785 }, { "epoch": 0.13119405102322698, "grad_norm": 0.2734375, "grad_norm_var": 0.0005718191464742024, "learning_rate": 0.0001, "loss": 1.3194, "loss/crossentropy": 2.3749125003814697, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.16710194945335388, "step": 8786 }, { "epoch": 0.13120898319384197, "grad_norm": 0.2578125, "grad_norm_var": 0.0005280454953511556, "learning_rate": 0.0001, "loss": 1.2524, "loss/crossentropy": 2.6166731119155884, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.1703224703669548, "step": 8787 }, { "epoch": 0.13122391536445696, "grad_norm": 0.234375, "grad_norm_var": 0.0004383683204650879, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.822224497795105, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.20613382011651993, "step": 8788 }, { "epoch": 0.13123884753507192, "grad_norm": 0.302734375, "grad_norm_var": 0.0004938602447509766, "learning_rate": 0.0001, "loss": 1.5208, "loss/crossentropy": 2.7455793619155884, "loss/fcd": 1.296875, "loss/idx": 10.5, "loss/logits": 0.22387707233428955, "step": 8789 }, { "epoch": 0.1312537797056869, "grad_norm": 0.224609375, "grad_norm_var": 0.0005084832509358723, "learning_rate": 0.0001, "loss": 1.3105, "loss/crossentropy": 2.9308096170425415, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1815713867545128, "step": 8790 }, { "epoch": 0.1312687118763019, "grad_norm": 0.25390625, "grad_norm_var": 0.00047403971354166666, "learning_rate": 0.0001, "loss": 1.3252, "loss/crossentropy": 2.6900964975357056, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.18853119015693665, "step": 8791 }, { "epoch": 0.1312836440469169, "grad_norm": 0.23046875, "grad_norm_var": 0.0005096435546875, "learning_rate": 0.0001, "loss": 1.3517, "loss/crossentropy": 2.622345447540283, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.19936061650514603, "step": 8792 }, { "epoch": 0.13129857621753185, "grad_norm": 0.2119140625, "grad_norm_var": 0.0006254792213439941, "learning_rate": 0.0001, "loss": 1.255, "loss/crossentropy": 2.623387932777405, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.17683328688144684, "step": 8793 }, { "epoch": 0.13131350838814684, "grad_norm": 0.244140625, "grad_norm_var": 0.0005318919817606608, "learning_rate": 0.0001, "loss": 1.3481, "loss/crossentropy": 2.5853466987609863, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.18018028885126114, "step": 8794 }, { "epoch": 0.13132844055876183, "grad_norm": 0.2353515625, "grad_norm_var": 0.0005287806193033854, "learning_rate": 0.0001, "loss": 1.4215, "loss/crossentropy": 2.7651220560073853, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.22225824743509293, "step": 8795 }, { "epoch": 0.1313433727293768, "grad_norm": 0.2265625, "grad_norm_var": 0.0004943688710530599, "learning_rate": 0.0001, "loss": 1.3854, "loss/crossentropy": 2.551769971847534, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.2017657458782196, "step": 8796 }, { "epoch": 0.13135830489999178, "grad_norm": 0.22265625, "grad_norm_var": 0.0004931767781575521, "learning_rate": 0.0001, "loss": 1.2315, "loss/crossentropy": 2.6074379682540894, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.15723730623722076, "step": 8797 }, { "epoch": 0.13137323707060677, "grad_norm": 0.25, "grad_norm_var": 0.0004931767781575521, "learning_rate": 0.0001, "loss": 1.5213, "loss/crossentropy": 2.4691524505615234, "loss/fcd": 1.2734375, "loss/idx": 10.5, "loss/logits": 0.24786216765642166, "step": 8798 }, { "epoch": 0.13138816924122176, "grad_norm": 0.2578125, "grad_norm_var": 0.0004931767781575521, "learning_rate": 0.0001, "loss": 1.3078, "loss/crossentropy": 2.5143643617630005, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.17498572915792465, "step": 8799 }, { "epoch": 0.13140310141183673, "grad_norm": 0.22265625, "grad_norm_var": 0.0005243778228759766, "learning_rate": 0.0001, "loss": 1.2706, "loss/crossentropy": 2.657851219177246, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.18468638509511948, "step": 8800 }, { "epoch": 0.13141803358245172, "grad_norm": 0.25390625, "grad_norm_var": 0.0005267461140950521, "learning_rate": 0.0001, "loss": 1.3597, "loss/crossentropy": 2.4166043996810913, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.19560940563678741, "step": 8801 }, { "epoch": 0.1314329657530667, "grad_norm": 0.265625, "grad_norm_var": 0.0004997889200846355, "learning_rate": 0.0001, "loss": 1.2897, "loss/crossentropy": 2.7795244455337524, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.1998293399810791, "step": 8802 }, { "epoch": 0.13144789792368167, "grad_norm": 0.263671875, "grad_norm_var": 0.0005131880442301432, "learning_rate": 0.0001, "loss": 1.4481, "loss/crossentropy": 2.5743558406829834, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.22930602729320526, "step": 8803 }, { "epoch": 0.13146283009429666, "grad_norm": 0.21875, "grad_norm_var": 0.0005480289459228515, "learning_rate": 0.0001, "loss": 1.4565, "loss/crossentropy": 2.3930879831314087, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.2182542383670807, "step": 8804 }, { "epoch": 0.13147776226491165, "grad_norm": 0.26171875, "grad_norm_var": 0.0003253936767578125, "learning_rate": 0.0001, "loss": 1.3117, "loss/crossentropy": 2.5534332990646362, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.18666132539510727, "step": 8805 }, { "epoch": 0.1314926944355266, "grad_norm": 0.197265625, "grad_norm_var": 0.0004290898640950521, "learning_rate": 0.0001, "loss": 1.2818, "loss/crossentropy": 2.262376070022583, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.17239663749933243, "step": 8806 }, { "epoch": 0.1315076266061416, "grad_norm": 0.26953125, "grad_norm_var": 0.0004763921101888021, "learning_rate": 0.0001, "loss": 1.4896, "loss/crossentropy": 2.2711617946624756, "loss/fcd": 1.2734375, "loss/idx": 10.5, "loss/logits": 0.21617892384529114, "step": 8807 }, { "epoch": 0.1315225587767566, "grad_norm": 0.2412109375, "grad_norm_var": 0.00047066609064737953, "learning_rate": 0.0001, "loss": 1.3181, "loss/crossentropy": 2.655574679374695, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.18524915724992752, "step": 8808 }, { "epoch": 0.13153749094737158, "grad_norm": 0.2021484375, "grad_norm_var": 0.0005134224891662597, "learning_rate": 0.0001, "loss": 1.286, "loss/crossentropy": 2.532176971435547, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.18055003136396408, "step": 8809 }, { "epoch": 0.13155242311798654, "grad_norm": 0.23046875, "grad_norm_var": 0.0005167603492736816, "learning_rate": 0.0001, "loss": 1.4079, "loss/crossentropy": 2.6680381298065186, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.22427187860012054, "step": 8810 }, { "epoch": 0.13156735528860153, "grad_norm": 0.2216796875, "grad_norm_var": 0.0005345622698465983, "learning_rate": 0.0001, "loss": 1.2246, "loss/crossentropy": 2.6017990112304688, "loss/fcd": 1.05859375, "loss/idx": 10.5, "loss/logits": 0.166054867208004, "step": 8811 }, { "epoch": 0.13158228745921652, "grad_norm": 0.2421875, "grad_norm_var": 0.0005262970924377441, "learning_rate": 0.0001, "loss": 1.4688, "loss/crossentropy": 2.56117844581604, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.23443001508712769, "step": 8812 }, { "epoch": 0.13159721962983148, "grad_norm": 0.2314453125, "grad_norm_var": 0.0005121707916259765, "learning_rate": 0.0001, "loss": 1.3845, "loss/crossentropy": 2.538038492202759, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.1930662840604782, "step": 8813 }, { "epoch": 0.13161215180044647, "grad_norm": 0.220703125, "grad_norm_var": 0.0005243301391601562, "learning_rate": 0.0001, "loss": 1.316, "loss/crossentropy": 2.2836846113204956, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.18318768590688705, "step": 8814 }, { "epoch": 0.13162708397106146, "grad_norm": 0.2373046875, "grad_norm_var": 0.0004952073097229004, "learning_rate": 0.0001, "loss": 1.3781, "loss/crossentropy": 2.649691104888916, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20232588052749634, "step": 8815 }, { "epoch": 0.13164201614167642, "grad_norm": 0.234375, "grad_norm_var": 0.00048252344131469724, "learning_rate": 0.0001, "loss": 1.2673, "loss/crossentropy": 2.7176660299301147, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.1774556264281273, "step": 8816 }, { "epoch": 0.1316569483122914, "grad_norm": 0.2119140625, "grad_norm_var": 0.0004980723063151042, "learning_rate": 0.0001, "loss": 1.2988, "loss/crossentropy": 2.7288894653320312, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1854807436466217, "step": 8817 }, { "epoch": 0.1316718804829064, "grad_norm": 0.2158203125, "grad_norm_var": 0.0004455844561258952, "learning_rate": 0.0001, "loss": 1.3063, "loss/crossentropy": 2.3551535606384277, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.17740409821271896, "step": 8818 }, { "epoch": 0.1316868126535214, "grad_norm": 0.212890625, "grad_norm_var": 0.0003873149553934733, "learning_rate": 0.0001, "loss": 1.2296, "loss/crossentropy": 2.526303768157959, "loss/fcd": 1.0703125, "loss/idx": 10.5, "loss/logits": 0.15926600247621536, "step": 8819 }, { "epoch": 0.13170174482413635, "grad_norm": 0.2412109375, "grad_norm_var": 0.0003908793131510417, "learning_rate": 0.0001, "loss": 1.2757, "loss/crossentropy": 2.451197624206543, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.17025967687368393, "step": 8820 }, { "epoch": 0.13171667699475134, "grad_norm": 0.302734375, "grad_norm_var": 0.0006722609202067057, "learning_rate": 0.0001, "loss": 1.5032, "loss/crossentropy": 2.3199455738067627, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.2375505417585373, "step": 8821 }, { "epoch": 0.13173160916536633, "grad_norm": 0.2265625, "grad_norm_var": 0.0005900065104166666, "learning_rate": 0.0001, "loss": 1.249, "loss/crossentropy": 2.7519551515579224, "loss/fcd": 1.06640625, "loss/idx": 10.5, "loss/logits": 0.18258299678564072, "step": 8822 }, { "epoch": 0.1317465413359813, "grad_norm": 0.2578125, "grad_norm_var": 0.0005428949991861979, "learning_rate": 0.0001, "loss": 1.3143, "loss/crossentropy": 2.4902899265289307, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.16195746511220932, "step": 8823 }, { "epoch": 0.13176147350659628, "grad_norm": 0.267578125, "grad_norm_var": 0.0006146709124247233, "learning_rate": 0.0001, "loss": 1.4738, "loss/crossentropy": 2.2641589641571045, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.21210463345050812, "step": 8824 }, { "epoch": 0.13177640567721127, "grad_norm": 0.2255859375, "grad_norm_var": 0.0005469600359598796, "learning_rate": 0.0001, "loss": 1.3787, "loss/crossentropy": 2.4768952131271362, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20289967209100723, "step": 8825 }, { "epoch": 0.13179133784782626, "grad_norm": 0.271484375, "grad_norm_var": 0.0006203929583231608, "learning_rate": 0.0001, "loss": 1.4968, "loss/crossentropy": 2.7176687717437744, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.23513050377368927, "step": 8826 }, { "epoch": 0.13180627001844122, "grad_norm": 0.2177734375, "grad_norm_var": 0.0006302793820699056, "learning_rate": 0.0001, "loss": 1.298, "loss/crossentropy": 2.574256181716919, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18864907324314117, "step": 8827 }, { "epoch": 0.1318212021890562, "grad_norm": 0.2421875, "grad_norm_var": 0.0006302793820699056, "learning_rate": 0.0001, "loss": 1.2869, "loss/crossentropy": 2.8222309350967407, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.18142211437225342, "step": 8828 }, { "epoch": 0.1318361343596712, "grad_norm": 0.25390625, "grad_norm_var": 0.0006404240926106771, "learning_rate": 0.0001, "loss": 1.3547, "loss/crossentropy": 2.7361135482788086, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.1984899640083313, "step": 8829 }, { "epoch": 0.13185106653028617, "grad_norm": 0.2236328125, "grad_norm_var": 0.0006334265073140462, "learning_rate": 0.0001, "loss": 1.3898, "loss/crossentropy": 2.5138871669769287, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.20621711760759354, "step": 8830 }, { "epoch": 0.13186599870090115, "grad_norm": 0.25390625, "grad_norm_var": 0.0006443023681640625, "learning_rate": 0.0001, "loss": 1.3904, "loss/crossentropy": 2.607467293739319, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.19903773814439774, "step": 8831 }, { "epoch": 0.13188093087151614, "grad_norm": 0.26171875, "grad_norm_var": 0.0006661097208658854, "learning_rate": 0.0001, "loss": 1.3798, "loss/crossentropy": 2.8251949548721313, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.20787649601697922, "step": 8832 }, { "epoch": 0.1318958630421311, "grad_norm": 0.306640625, "grad_norm_var": 0.0008353193600972494, "learning_rate": 0.0001, "loss": 1.3481, "loss/crossentropy": 2.611420154571533, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.1840296983718872, "step": 8833 }, { "epoch": 0.1319107952127461, "grad_norm": 0.224609375, "grad_norm_var": 0.0008014520009358724, "learning_rate": 0.0001, "loss": 1.4159, "loss/crossentropy": 2.7321581840515137, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.22837185859680176, "step": 8834 }, { "epoch": 0.13192572738336109, "grad_norm": 0.294921875, "grad_norm_var": 0.0008228143056233724, "learning_rate": 0.0001, "loss": 1.4235, "loss/crossentropy": 2.6320571899414062, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.21649831533432007, "step": 8835 }, { "epoch": 0.13194065955397608, "grad_norm": 0.255859375, "grad_norm_var": 0.0008102377255757649, "learning_rate": 0.0001, "loss": 1.4389, "loss/crossentropy": 2.7387192249298096, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.21237428486347198, "step": 8836 }, { "epoch": 0.13195559172459104, "grad_norm": 0.26171875, "grad_norm_var": 0.0006566961606343587, "learning_rate": 0.0001, "loss": 1.2452, "loss/crossentropy": 2.560963988304138, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.16708487272262573, "step": 8837 }, { "epoch": 0.13197052389520603, "grad_norm": 0.357421875, "grad_norm_var": 0.0012679696083068848, "learning_rate": 0.0001, "loss": 1.6307, "loss/crossentropy": 2.676060914993286, "loss/fcd": 1.34375, "loss/idx": 10.5, "loss/logits": 0.2869645953178406, "step": 8838 }, { "epoch": 0.13198545606582102, "grad_norm": 0.2275390625, "grad_norm_var": 0.0013383070627848307, "learning_rate": 0.0001, "loss": 1.3076, "loss/crossentropy": 2.864909529685974, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.19431602954864502, "step": 8839 }, { "epoch": 0.13200038823643598, "grad_norm": 0.287109375, "grad_norm_var": 0.0013840834299723307, "learning_rate": 0.0001, "loss": 1.3921, "loss/crossentropy": 2.333187699317932, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.18117347359657288, "step": 8840 }, { "epoch": 0.13201532040705097, "grad_norm": 0.2490234375, "grad_norm_var": 0.0013096968332926432, "learning_rate": 0.0001, "loss": 1.4264, "loss/crossentropy": 2.6334019899368286, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.21159139275550842, "step": 8841 }, { "epoch": 0.13203025257766596, "grad_norm": 0.2333984375, "grad_norm_var": 0.0013513843218485515, "learning_rate": 0.0001, "loss": 1.3723, "loss/crossentropy": 2.580502986907959, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.19653500616550446, "step": 8842 }, { "epoch": 0.13204518474828095, "grad_norm": 0.296875, "grad_norm_var": 0.0013027826944986979, "learning_rate": 0.0001, "loss": 1.5692, "loss/crossentropy": 2.594636559486389, "loss/fcd": 1.33203125, "loss/idx": 10.5, "loss/logits": 0.23712994903326035, "step": 8843 }, { "epoch": 0.1320601169188959, "grad_norm": 0.25390625, "grad_norm_var": 0.0012766520182291667, "learning_rate": 0.0001, "loss": 1.3299, "loss/crossentropy": 2.2228610515594482, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.16972333192825317, "step": 8844 }, { "epoch": 0.1320750490895109, "grad_norm": 0.208984375, "grad_norm_var": 0.001470041275024414, "learning_rate": 0.0001, "loss": 1.2144, "loss/crossentropy": 2.5766462087631226, "loss/fcd": 1.0546875, "loss/idx": 10.5, "loss/logits": 0.15968143939971924, "step": 8845 }, { "epoch": 0.1320899812601259, "grad_norm": 0.271484375, "grad_norm_var": 0.0013662616411844889, "learning_rate": 0.0001, "loss": 1.5498, "loss/crossentropy": 2.4207650423049927, "loss/fcd": 1.33203125, "loss/idx": 10.5, "loss/logits": 0.21777526289224625, "step": 8846 }, { "epoch": 0.13210491343074085, "grad_norm": 0.267578125, "grad_norm_var": 0.0013571381568908691, "learning_rate": 0.0001, "loss": 1.4561, "loss/crossentropy": 2.508748412132263, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.19823967665433884, "step": 8847 }, { "epoch": 0.13211984560135584, "grad_norm": 0.23828125, "grad_norm_var": 0.0014053940773010253, "learning_rate": 0.0001, "loss": 1.3341, "loss/crossentropy": 2.5913901329040527, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.18171925842761993, "step": 8848 }, { "epoch": 0.13213477777197083, "grad_norm": 0.2265625, "grad_norm_var": 0.001358473300933838, "learning_rate": 0.0001, "loss": 1.2637, "loss/crossentropy": 2.6062854528427124, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.16214654594659805, "step": 8849 }, { "epoch": 0.1321497099425858, "grad_norm": 0.2138671875, "grad_norm_var": 0.0014159520467122396, "learning_rate": 0.0001, "loss": 1.2947, "loss/crossentropy": 2.6220027208328247, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.1852966696023941, "step": 8850 }, { "epoch": 0.13216464211320078, "grad_norm": 0.240234375, "grad_norm_var": 0.0013411839803059895, "learning_rate": 0.0001, "loss": 1.2971, "loss/crossentropy": 2.724003553390503, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.18381881713867188, "step": 8851 }, { "epoch": 0.13217957428381577, "grad_norm": 0.232421875, "grad_norm_var": 0.0013747533162434896, "learning_rate": 0.0001, "loss": 1.3805, "loss/crossentropy": 2.868537187576294, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.19692016392946243, "step": 8852 }, { "epoch": 0.13219450645443076, "grad_norm": 0.26953125, "grad_norm_var": 0.0013864517211914062, "learning_rate": 0.0001, "loss": 1.2507, "loss/crossentropy": 2.675292134284973, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.16478679329156876, "step": 8853 }, { "epoch": 0.13220943862504572, "grad_norm": 0.2353515625, "grad_norm_var": 0.0006448705991109212, "learning_rate": 0.0001, "loss": 1.3003, "loss/crossentropy": 2.5507049560546875, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.17924963682889938, "step": 8854 }, { "epoch": 0.1322243707956607, "grad_norm": 0.296875, "grad_norm_var": 0.0007653395334879557, "learning_rate": 0.0001, "loss": 1.5648, "loss/crossentropy": 2.699708104133606, "loss/fcd": 1.30078125, "loss/idx": 10.5, "loss/logits": 0.2640087306499481, "step": 8855 }, { "epoch": 0.1322393029662757, "grad_norm": 0.2392578125, "grad_norm_var": 0.0006802519162495931, "learning_rate": 0.0001, "loss": 1.4926, "loss/crossentropy": 2.347045421600342, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.24257220327854156, "step": 8856 }, { "epoch": 0.13225423513689066, "grad_norm": 0.2392578125, "grad_norm_var": 0.0006853381792704265, "learning_rate": 0.0001, "loss": 1.3507, "loss/crossentropy": 2.6120657920837402, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.19050665944814682, "step": 8857 }, { "epoch": 0.13226916730750565, "grad_norm": 0.2734375, "grad_norm_var": 0.0007089614868164062, "learning_rate": 0.0001, "loss": 1.4089, "loss/crossentropy": 2.4505022764205933, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.21363038569688797, "step": 8858 }, { "epoch": 0.13228409947812064, "grad_norm": 0.220703125, "grad_norm_var": 0.000598001480102539, "learning_rate": 0.0001, "loss": 1.3233, "loss/crossentropy": 2.4681421518325806, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.19044844061136246, "step": 8859 }, { "epoch": 0.13229903164873563, "grad_norm": 0.236328125, "grad_norm_var": 0.0005975723266601563, "learning_rate": 0.0001, "loss": 1.298, "loss/crossentropy": 2.6271255016326904, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18860077857971191, "step": 8860 }, { "epoch": 0.1323139638193506, "grad_norm": 0.28515625, "grad_norm_var": 0.0006006717681884766, "learning_rate": 0.0001, "loss": 1.6124, "loss/crossentropy": 2.6491132974624634, "loss/fcd": 1.34375, "loss/idx": 10.5, "loss/logits": 0.2686386704444885, "step": 8861 }, { "epoch": 0.13232889598996558, "grad_norm": 0.2421875, "grad_norm_var": 0.0005670547485351563, "learning_rate": 0.0001, "loss": 1.3021, "loss/crossentropy": 2.7135428190231323, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.18100712448358536, "step": 8862 }, { "epoch": 0.13234382816058057, "grad_norm": 0.259765625, "grad_norm_var": 0.0005497614542643229, "learning_rate": 0.0001, "loss": 1.3653, "loss/crossentropy": 2.557835578918457, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19341476261615753, "step": 8863 }, { "epoch": 0.13235876033119554, "grad_norm": 0.2236328125, "grad_norm_var": 0.0005798617998758952, "learning_rate": 0.0001, "loss": 1.2391, "loss/crossentropy": 2.6160695552825928, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.16485872864723206, "step": 8864 }, { "epoch": 0.13237369250181052, "grad_norm": 0.248046875, "grad_norm_var": 0.000553286075592041, "learning_rate": 0.0001, "loss": 1.3121, "loss/crossentropy": 2.5891222953796387, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.17927560210227966, "step": 8865 }, { "epoch": 0.13238862467242551, "grad_norm": 0.236328125, "grad_norm_var": 0.0004848321278889974, "learning_rate": 0.0001, "loss": 1.4593, "loss/crossentropy": 2.5027376413345337, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.22887559235095978, "step": 8866 }, { "epoch": 0.13240355684304048, "grad_norm": 0.283203125, "grad_norm_var": 0.0005519707997639974, "learning_rate": 0.0001, "loss": 1.4736, "loss/crossentropy": 2.5434399843215942, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.21189425885677338, "step": 8867 }, { "epoch": 0.13241848901365547, "grad_norm": 0.2265625, "grad_norm_var": 0.0005688985188802083, "learning_rate": 0.0001, "loss": 1.2637, "loss/crossentropy": 2.5540053844451904, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.1699429750442505, "step": 8868 }, { "epoch": 0.13243342118427046, "grad_norm": 0.2255859375, "grad_norm_var": 0.0005808790524800618, "learning_rate": 0.0001, "loss": 1.3316, "loss/crossentropy": 2.691591739654541, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.1910201758146286, "step": 8869 }, { "epoch": 0.13244835335488545, "grad_norm": 0.2236328125, "grad_norm_var": 0.00060958464940389, "learning_rate": 0.0001, "loss": 1.239, "loss/crossentropy": 2.7731395959854126, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.1609116792678833, "step": 8870 }, { "epoch": 0.1324632855255004, "grad_norm": 0.220703125, "grad_norm_var": 0.00047072966893514, "learning_rate": 0.0001, "loss": 1.3647, "loss/crossentropy": 2.5677813291549683, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.1967625394463539, "step": 8871 }, { "epoch": 0.1324782176961154, "grad_norm": 0.232421875, "grad_norm_var": 0.0004768212636311849, "learning_rate": 0.0001, "loss": 1.3035, "loss/crossentropy": 2.60045325756073, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.17854107171297073, "step": 8872 }, { "epoch": 0.1324931498667304, "grad_norm": 0.318359375, "grad_norm_var": 0.0008357008298238119, "learning_rate": 0.0001, "loss": 1.4748, "loss/crossentropy": 2.7334078550338745, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.22479411959648132, "step": 8873 }, { "epoch": 0.13250808203734535, "grad_norm": 0.251953125, "grad_norm_var": 0.0007895429929097494, "learning_rate": 0.0001, "loss": 1.5091, "loss/crossentropy": 2.5960934162139893, "loss/fcd": 1.26953125, "loss/idx": 10.5, "loss/logits": 0.23952510952949524, "step": 8874 }, { "epoch": 0.13252301420796034, "grad_norm": 0.208984375, "grad_norm_var": 0.0008375128110249837, "learning_rate": 0.0001, "loss": 1.4222, "loss/crossentropy": 2.375941753387451, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.21906881034374237, "step": 8875 }, { "epoch": 0.13253794637857533, "grad_norm": 0.24609375, "grad_norm_var": 0.0008319497108459472, "learning_rate": 0.0001, "loss": 1.3456, "loss/crossentropy": 2.7368152141571045, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.19715134799480438, "step": 8876 }, { "epoch": 0.1325528785491903, "grad_norm": 0.26953125, "grad_norm_var": 0.0007651925086975097, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.701388120651245, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19837487488985062, "step": 8877 }, { "epoch": 0.13256781071980528, "grad_norm": 0.2470703125, "grad_norm_var": 0.0007649739583333333, "learning_rate": 0.0001, "loss": 1.3382, "loss/crossentropy": 2.533538818359375, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.18978386372327805, "step": 8878 }, { "epoch": 0.13258274289042027, "grad_norm": 0.2255859375, "grad_norm_var": 0.0007712324460347493, "learning_rate": 0.0001, "loss": 1.2057, "loss/crossentropy": 2.5741156339645386, "loss/fcd": 1.0546875, "loss/idx": 10.5, "loss/logits": 0.1510222926735878, "step": 8879 }, { "epoch": 0.13259767506103526, "grad_norm": 0.2353515625, "grad_norm_var": 0.0007495840390523274, "learning_rate": 0.0001, "loss": 1.2804, "loss/crossentropy": 2.606109142303467, "loss/fcd": 1.107421875, "loss/idx": 10.5, "loss/logits": 0.17298530042171478, "step": 8880 }, { "epoch": 0.13261260723165022, "grad_norm": 0.2578125, "grad_norm_var": 0.0007611870765686035, "learning_rate": 0.0001, "loss": 1.3979, "loss/crossentropy": 2.7616093158721924, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.21038750559091568, "step": 8881 }, { "epoch": 0.1326275394022652, "grad_norm": 0.26171875, "grad_norm_var": 0.0007744113604227702, "learning_rate": 0.0001, "loss": 1.3524, "loss/crossentropy": 2.7328842878341675, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.1883678138256073, "step": 8882 }, { "epoch": 0.1326424715728802, "grad_norm": 0.21875, "grad_norm_var": 0.0007135669390360514, "learning_rate": 0.0001, "loss": 1.2439, "loss/crossentropy": 2.5153945684432983, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.1618361547589302, "step": 8883 }, { "epoch": 0.13265740374349516, "grad_norm": 0.31640625, "grad_norm_var": 0.001034541924794515, "learning_rate": 0.0001, "loss": 1.4788, "loss/crossentropy": 2.5413447618484497, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.23665080964565277, "step": 8884 }, { "epoch": 0.13267233591411015, "grad_norm": 0.416015625, "grad_norm_var": 0.002744658788045247, "learning_rate": 0.0001, "loss": 1.4099, "loss/crossentropy": 2.3225817680358887, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.19897138327360153, "step": 8885 }, { "epoch": 0.13268726808472514, "grad_norm": 0.267578125, "grad_norm_var": 0.0026557882626851398, "learning_rate": 0.0001, "loss": 1.4307, "loss/crossentropy": 2.6371665000915527, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.22367160022258759, "step": 8886 }, { "epoch": 0.13270220025534013, "grad_norm": 0.3046875, "grad_norm_var": 0.002632550398508708, "learning_rate": 0.0001, "loss": 1.5436, "loss/crossentropy": 2.520174264907837, "loss/fcd": 1.30078125, "loss/idx": 10.5, "loss/logits": 0.24286136031150818, "step": 8887 }, { "epoch": 0.1327171324259551, "grad_norm": 0.244140625, "grad_norm_var": 0.0025864879290262857, "learning_rate": 0.0001, "loss": 1.363, "loss/crossentropy": 2.4038987159729004, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.19895411282777786, "step": 8888 }, { "epoch": 0.13273206459657008, "grad_norm": 0.228515625, "grad_norm_var": 0.0024892449378967284, "learning_rate": 0.0001, "loss": 1.4626, "loss/crossentropy": 2.667299270629883, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.2360628917813301, "step": 8889 }, { "epoch": 0.13274699676718507, "grad_norm": 0.28125, "grad_norm_var": 0.0025016427040100097, "learning_rate": 0.0001, "loss": 1.2669, "loss/crossentropy": 2.8773406744003296, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.18091747164726257, "step": 8890 }, { "epoch": 0.13276192893780003, "grad_norm": 0.259765625, "grad_norm_var": 0.0022879878679911295, "learning_rate": 0.0001, "loss": 1.416, "loss/crossentropy": 2.8568522930145264, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.20510869473218918, "step": 8891 }, { "epoch": 0.13277686110841502, "grad_norm": 0.251953125, "grad_norm_var": 0.002273396650950114, "learning_rate": 0.0001, "loss": 1.3783, "loss/crossentropy": 2.3920687437057495, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.19075965881347656, "step": 8892 }, { "epoch": 0.13279179327903, "grad_norm": 0.2314453125, "grad_norm_var": 0.0023556868235270183, "learning_rate": 0.0001, "loss": 1.3411, "loss/crossentropy": 2.5418816804885864, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.18871252983808517, "step": 8893 }, { "epoch": 0.13280672544964497, "grad_norm": 0.263671875, "grad_norm_var": 0.002332111199696859, "learning_rate": 0.0001, "loss": 1.5255, "loss/crossentropy": 2.918432116508484, "loss/fcd": 1.28125, "loss/idx": 10.5, "loss/logits": 0.24423928558826447, "step": 8894 }, { "epoch": 0.13282165762025996, "grad_norm": 0.25, "grad_norm_var": 0.002236048380533854, "learning_rate": 0.0001, "loss": 1.2878, "loss/crossentropy": 2.5414304733276367, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.17063161730766296, "step": 8895 }, { "epoch": 0.13283658979087495, "grad_norm": 0.2451171875, "grad_norm_var": 0.002199411392211914, "learning_rate": 0.0001, "loss": 1.381, "loss/crossentropy": 2.7063578367233276, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20519881695508957, "step": 8896 }, { "epoch": 0.13285152196148994, "grad_norm": 0.2353515625, "grad_norm_var": 0.002263478438059489, "learning_rate": 0.0001, "loss": 1.3866, "loss/crossentropy": 2.702838659286499, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.19909017533063889, "step": 8897 }, { "epoch": 0.1328664541321049, "grad_norm": 0.265625, "grad_norm_var": 0.0022615393002827964, "learning_rate": 0.0001, "loss": 1.3618, "loss/crossentropy": 2.409602999687195, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.18995538353919983, "step": 8898 }, { "epoch": 0.1328813863027199, "grad_norm": 0.26953125, "grad_norm_var": 0.0020925164222717284, "learning_rate": 0.0001, "loss": 1.4699, "loss/crossentropy": 2.824800968170166, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.21993764489889145, "step": 8899 }, { "epoch": 0.13289631847333488, "grad_norm": 0.2451171875, "grad_norm_var": 0.0019756158192952475, "learning_rate": 0.0001, "loss": 1.3667, "loss/crossentropy": 2.452968120574951, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.18699291348457336, "step": 8900 }, { "epoch": 0.13291125064394985, "grad_norm": 0.2236328125, "grad_norm_var": 0.0004467924435933431, "learning_rate": 0.0001, "loss": 1.2687, "loss/crossentropy": 2.5447988510131836, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.1827152594923973, "step": 8901 }, { "epoch": 0.13292618281456484, "grad_norm": 0.28125, "grad_norm_var": 0.00048284133275349936, "learning_rate": 0.0001, "loss": 1.4648, "loss/crossentropy": 2.7209649085998535, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.23045474290847778, "step": 8902 }, { "epoch": 0.13294111498517983, "grad_norm": 0.263671875, "grad_norm_var": 0.00031661589940389, "learning_rate": 0.0001, "loss": 1.4243, "loss/crossentropy": 2.69802987575531, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.21726436913013458, "step": 8903 }, { "epoch": 0.13295604715579482, "grad_norm": 0.240234375, "grad_norm_var": 0.00032192468643188477, "learning_rate": 0.0001, "loss": 1.4278, "loss/crossentropy": 2.58715558052063, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.22075870633125305, "step": 8904 }, { "epoch": 0.13297097932640978, "grad_norm": 0.2216796875, "grad_norm_var": 0.0003464857737223307, "learning_rate": 0.0001, "loss": 1.2568, "loss/crossentropy": 2.7357699871063232, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.17475451529026031, "step": 8905 }, { "epoch": 0.13298591149702477, "grad_norm": 0.234375, "grad_norm_var": 0.0002999464670817057, "learning_rate": 0.0001, "loss": 1.3768, "loss/crossentropy": 2.3557883501052856, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.1931678056716919, "step": 8906 }, { "epoch": 0.13300084366763976, "grad_norm": 0.26953125, "grad_norm_var": 0.0003200531005859375, "learning_rate": 0.0001, "loss": 1.5184, "loss/crossentropy": 2.4545738697052, "loss/fcd": 1.29296875, "loss/idx": 10.5, "loss/logits": 0.2254096418619156, "step": 8907 }, { "epoch": 0.13301577583825472, "grad_norm": 0.26953125, "grad_norm_var": 0.0003450870513916016, "learning_rate": 0.0001, "loss": 1.5411, "loss/crossentropy": 2.4621431827545166, "loss/fcd": 1.296875, "loss/idx": 10.5, "loss/logits": 0.24422481656074524, "step": 8908 }, { "epoch": 0.1330307080088697, "grad_norm": 0.2412109375, "grad_norm_var": 0.0003260930379231771, "learning_rate": 0.0001, "loss": 1.3028, "loss/crossentropy": 2.5893882513046265, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.17779848724603653, "step": 8909 }, { "epoch": 0.1330456401794847, "grad_norm": 0.255859375, "grad_norm_var": 0.00031693776448567707, "learning_rate": 0.0001, "loss": 1.4793, "loss/crossentropy": 2.5856130123138428, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.22144416719675064, "step": 8910 }, { "epoch": 0.13306057235009966, "grad_norm": 0.2060546875, "grad_norm_var": 0.0004419287045796712, "learning_rate": 0.0001, "loss": 1.2605, "loss/crossentropy": 2.5895886421203613, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.16676948219537735, "step": 8911 }, { "epoch": 0.13307550452071465, "grad_norm": 0.2412109375, "grad_norm_var": 0.00044437646865844724, "learning_rate": 0.0001, "loss": 1.3137, "loss/crossentropy": 2.6257591247558594, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1848209723830223, "step": 8912 }, { "epoch": 0.13309043669132964, "grad_norm": 0.365234375, "grad_norm_var": 0.0012841542561848958, "learning_rate": 0.0001, "loss": 1.6001, "loss/crossentropy": 2.2625043988227844, "loss/fcd": 1.3671875, "loss/idx": 10.5, "loss/logits": 0.23289518058300018, "step": 8913 }, { "epoch": 0.13310536886194463, "grad_norm": 0.43359375, "grad_norm_var": 0.0032662073771158855, "learning_rate": 0.0001, "loss": 1.6113, "loss/crossentropy": 2.9161194562911987, "loss/fcd": 1.37109375, "loss/idx": 10.5, "loss/logits": 0.24016255885362625, "step": 8914 }, { "epoch": 0.1331203010325596, "grad_norm": 0.26953125, "grad_norm_var": 0.0032662073771158855, "learning_rate": 0.0001, "loss": 1.5172, "loss/crossentropy": 2.751498579978943, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.25157737731933594, "step": 8915 }, { "epoch": 0.13313523320317458, "grad_norm": 0.251953125, "grad_norm_var": 0.0032497684160868326, "learning_rate": 0.0001, "loss": 1.2887, "loss/crossentropy": 2.6594839096069336, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1597955971956253, "step": 8916 }, { "epoch": 0.13315016537378957, "grad_norm": 0.2158203125, "grad_norm_var": 0.003298532962799072, "learning_rate": 0.0001, "loss": 1.2874, "loss/crossentropy": 2.8233633041381836, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.19364168494939804, "step": 8917 }, { "epoch": 0.13316509754440453, "grad_norm": 0.2099609375, "grad_norm_var": 0.0034740289052327474, "learning_rate": 0.0001, "loss": 1.3174, "loss/crossentropy": 2.2414721250534058, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.1846255511045456, "step": 8918 }, { "epoch": 0.13318002971501952, "grad_norm": 0.2431640625, "grad_norm_var": 0.003495307763417562, "learning_rate": 0.0001, "loss": 1.4544, "loss/crossentropy": 2.5645612478256226, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.23955465108156204, "step": 8919 }, { "epoch": 0.1331949618856345, "grad_norm": 0.240234375, "grad_norm_var": 0.003495307763417562, "learning_rate": 0.0001, "loss": 1.3705, "loss/crossentropy": 2.6313655376434326, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19862816482782364, "step": 8920 }, { "epoch": 0.1332098940562495, "grad_norm": 0.271484375, "grad_norm_var": 0.003392155965169271, "learning_rate": 0.0001, "loss": 1.4864, "loss/crossentropy": 2.6917500495910645, "loss/fcd": 1.25390625, "loss/idx": 10.5, "loss/logits": 0.23244378715753555, "step": 8921 }, { "epoch": 0.13322482622686446, "grad_norm": 0.26171875, "grad_norm_var": 0.003332074483235677, "learning_rate": 0.0001, "loss": 1.2841, "loss/crossentropy": 2.6352198123931885, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.16304779052734375, "step": 8922 }, { "epoch": 0.13323975839747945, "grad_norm": 0.5078125, "grad_norm_var": 0.007012557983398437, "learning_rate": 0.0001, "loss": 1.6114, "loss/crossentropy": 3.030181407928467, "loss/fcd": 1.359375, "loss/idx": 10.5, "loss/logits": 0.25201184302568436, "step": 8923 }, { "epoch": 0.13325469056809444, "grad_norm": 0.28515625, "grad_norm_var": 0.007005437215169271, "learning_rate": 0.0001, "loss": 1.3397, "loss/crossentropy": 2.806355118751526, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.1873307228088379, "step": 8924 }, { "epoch": 0.1332696227387094, "grad_norm": 0.25390625, "grad_norm_var": 0.006947735945383708, "learning_rate": 0.0001, "loss": 1.327, "loss/crossentropy": 2.759392499923706, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.1903233528137207, "step": 8925 }, { "epoch": 0.1332845549093244, "grad_norm": 0.2490234375, "grad_norm_var": 0.006974522272745768, "learning_rate": 0.0001, "loss": 1.3699, "loss/crossentropy": 2.4701567888259888, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.1941021978855133, "step": 8926 }, { "epoch": 0.13329948707993938, "grad_norm": 0.23828125, "grad_norm_var": 0.006714753309885661, "learning_rate": 0.0001, "loss": 1.4991, "loss/crossentropy": 2.4915305376052856, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.24913573265075684, "step": 8927 }, { "epoch": 0.13331441925055434, "grad_norm": 0.251953125, "grad_norm_var": 0.006661208470662435, "learning_rate": 0.0001, "loss": 1.2904, "loss/crossentropy": 2.7508933544158936, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.1927461251616478, "step": 8928 }, { "epoch": 0.13332935142116933, "grad_norm": 0.23046875, "grad_norm_var": 0.006342061360677083, "learning_rate": 0.0001, "loss": 1.2747, "loss/crossentropy": 2.642590880393982, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.1809365376830101, "step": 8929 }, { "epoch": 0.13334428359178432, "grad_norm": 0.2421875, "grad_norm_var": 0.004606819152832032, "learning_rate": 0.0001, "loss": 1.3865, "loss/crossentropy": 2.674217462539673, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.19513268768787384, "step": 8930 }, { "epoch": 0.1333592157623993, "grad_norm": 0.236328125, "grad_norm_var": 0.00465086301167806, "learning_rate": 0.0001, "loss": 1.2727, "loss/crossentropy": 2.578761100769043, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.18288438767194748, "step": 8931 }, { "epoch": 0.13337414793301428, "grad_norm": 0.275390625, "grad_norm_var": 0.004654296239217122, "learning_rate": 0.0001, "loss": 1.4034, "loss/crossentropy": 2.4468761682510376, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.20806525647640228, "step": 8932 }, { "epoch": 0.13338908010362927, "grad_norm": 0.283203125, "grad_norm_var": 0.004511447747548422, "learning_rate": 0.0001, "loss": 1.4413, "loss/crossentropy": 2.6418585777282715, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.23040500283241272, "step": 8933 }, { "epoch": 0.13340401227424425, "grad_norm": 0.248046875, "grad_norm_var": 0.004309829076131185, "learning_rate": 0.0001, "loss": 1.4525, "loss/crossentropy": 2.3444600105285645, "loss/fcd": 1.25390625, "loss/idx": 10.5, "loss/logits": 0.19863267242908478, "step": 8934 }, { "epoch": 0.13341894444485922, "grad_norm": 0.25, "grad_norm_var": 0.004288383324940999, "learning_rate": 0.0001, "loss": 1.4533, "loss/crossentropy": 2.6326462030410767, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.21505248546600342, "step": 8935 }, { "epoch": 0.1334338766154742, "grad_norm": 0.240234375, "grad_norm_var": 0.004288383324940999, "learning_rate": 0.0001, "loss": 1.4317, "loss/crossentropy": 2.54652738571167, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.21690496802330017, "step": 8936 }, { "epoch": 0.1334488087860892, "grad_norm": 0.2578125, "grad_norm_var": 0.004297951857248942, "learning_rate": 0.0001, "loss": 1.4418, "loss/crossentropy": 2.5286964178085327, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.23087207973003387, "step": 8937 }, { "epoch": 0.13346374095670416, "grad_norm": 0.259765625, "grad_norm_var": 0.004300208886464437, "learning_rate": 0.0001, "loss": 1.3715, "loss/crossentropy": 2.3873337507247925, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.1918264552950859, "step": 8938 }, { "epoch": 0.13347867312731915, "grad_norm": 0.2734375, "grad_norm_var": 0.0002814253171284994, "learning_rate": 0.0001, "loss": 1.4276, "loss/crossentropy": 2.651992082595825, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.20105402171611786, "step": 8939 }, { "epoch": 0.13349360529793414, "grad_norm": 0.2275390625, "grad_norm_var": 0.0002549330393473307, "learning_rate": 0.0001, "loss": 1.3277, "loss/crossentropy": 2.5947564840316772, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.19490541517734528, "step": 8940 }, { "epoch": 0.13350853746854913, "grad_norm": 0.2099609375, "grad_norm_var": 0.00035918156305948893, "learning_rate": 0.0001, "loss": 1.2678, "loss/crossentropy": 2.7288620471954346, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.18181587755680084, "step": 8941 }, { "epoch": 0.1335234696391641, "grad_norm": 0.46875, "grad_norm_var": 0.003396336237589518, "learning_rate": 0.0001, "loss": 1.7041, "loss/crossentropy": 2.420608639717102, "loss/fcd": 1.44921875, "loss/idx": 10.5, "loss/logits": 0.2548433318734169, "step": 8942 }, { "epoch": 0.13353840180977908, "grad_norm": 0.279296875, "grad_norm_var": 0.003371302286783854, "learning_rate": 0.0001, "loss": 1.3703, "loss/crossentropy": 2.839035987854004, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19839370995759964, "step": 8943 }, { "epoch": 0.13355333398039407, "grad_norm": 0.220703125, "grad_norm_var": 0.003485234578450521, "learning_rate": 0.0001, "loss": 1.4064, "loss/crossentropy": 2.438520312309265, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.21888674795627594, "step": 8944 }, { "epoch": 0.13356826615100903, "grad_norm": 0.28125, "grad_norm_var": 0.003428204854329427, "learning_rate": 0.0001, "loss": 1.4028, "loss/crossentropy": 2.5005087852478027, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.19183053076267242, "step": 8945 }, { "epoch": 0.13358319832162402, "grad_norm": 0.2265625, "grad_norm_var": 0.0034928003946940104, "learning_rate": 0.0001, "loss": 1.3087, "loss/crossentropy": 2.5670799016952515, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.17584016919136047, "step": 8946 }, { "epoch": 0.133598130492239, "grad_norm": 0.291015625, "grad_norm_var": 0.0034714380900065104, "learning_rate": 0.0001, "loss": 1.4189, "loss/crossentropy": 2.4703657627105713, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.19622241705656052, "step": 8947 }, { "epoch": 0.133613062662854, "grad_norm": 0.2421875, "grad_norm_var": 0.0035089969635009764, "learning_rate": 0.0001, "loss": 1.3097, "loss/crossentropy": 2.5344208478927612, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.18075183033943176, "step": 8948 }, { "epoch": 0.13362799483346896, "grad_norm": 0.21484375, "grad_norm_var": 0.003646405537923177, "learning_rate": 0.0001, "loss": 1.1812, "loss/crossentropy": 2.6964815855026245, "loss/fcd": 1.0234375, "loss/idx": 10.5, "loss/logits": 0.1577676385641098, "step": 8949 }, { "epoch": 0.13364292700408395, "grad_norm": 0.2001953125, "grad_norm_var": 0.0038783033688863117, "learning_rate": 0.0001, "loss": 1.2262, "loss/crossentropy": 2.4904603958129883, "loss/fcd": 1.0703125, "loss/idx": 10.5, "loss/logits": 0.1558375433087349, "step": 8950 }, { "epoch": 0.13365785917469894, "grad_norm": 0.2470703125, "grad_norm_var": 0.0038823445638020834, "learning_rate": 0.0001, "loss": 1.4371, "loss/crossentropy": 2.934862732887268, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.23007888346910477, "step": 8951 }, { "epoch": 0.1336727913453139, "grad_norm": 0.24609375, "grad_norm_var": 0.003869994481404622, "learning_rate": 0.0001, "loss": 1.2221, "loss/crossentropy": 2.6315375566482544, "loss/fcd": 1.05078125, "loss/idx": 10.5, "loss/logits": 0.17127789556980133, "step": 8952 }, { "epoch": 0.1336877235159289, "grad_norm": 0.259765625, "grad_norm_var": 0.0038698832194010418, "learning_rate": 0.0001, "loss": 1.3895, "loss/crossentropy": 2.359252095222473, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.17851439863443375, "step": 8953 }, { "epoch": 0.13370265568654388, "grad_norm": 0.2021484375, "grad_norm_var": 0.004073615868886312, "learning_rate": 0.0001, "loss": 1.2241, "loss/crossentropy": 2.533590078353882, "loss/fcd": 1.05859375, "loss/idx": 10.5, "loss/logits": 0.16554398834705353, "step": 8954 }, { "epoch": 0.13371758785715884, "grad_norm": 0.37109375, "grad_norm_var": 0.004900928338368734, "learning_rate": 0.0001, "loss": 1.5127, "loss/crossentropy": 2.504981756210327, "loss/fcd": 1.25390625, "loss/idx": 10.5, "loss/logits": 0.25883927941322327, "step": 8955 }, { "epoch": 0.13373252002777383, "grad_norm": 0.26171875, "grad_norm_var": 0.004817899068196615, "learning_rate": 0.0001, "loss": 1.3783, "loss/crossentropy": 2.667757034301758, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.19468168914318085, "step": 8956 }, { "epoch": 0.13374745219838882, "grad_norm": 0.275390625, "grad_norm_var": 0.004614762465159098, "learning_rate": 0.0001, "loss": 1.508, "loss/crossentropy": 2.415898561477661, "loss/fcd": 1.2890625, "loss/idx": 10.5, "loss/logits": 0.21889524906873703, "step": 8957 }, { "epoch": 0.1337623843690038, "grad_norm": 0.21875, "grad_norm_var": 0.001829524834950765, "learning_rate": 0.0001, "loss": 1.3799, "loss/crossentropy": 2.5539461374282837, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.19237779080867767, "step": 8958 }, { "epoch": 0.13377731653961877, "grad_norm": 0.220703125, "grad_norm_var": 0.001833816369374593, "learning_rate": 0.0001, "loss": 1.2854, "loss/crossentropy": 2.5905168056488037, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.1838412880897522, "step": 8959 }, { "epoch": 0.13379224871023376, "grad_norm": 0.236328125, "grad_norm_var": 0.0017907102902730307, "learning_rate": 0.0001, "loss": 1.2747, "loss/crossentropy": 2.646592617034912, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.18095404654741287, "step": 8960 }, { "epoch": 0.13380718088084875, "grad_norm": 0.291015625, "grad_norm_var": 0.001837758223215739, "learning_rate": 0.0001, "loss": 1.4376, "loss/crossentropy": 2.4655840396881104, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.18763517588377, "step": 8961 }, { "epoch": 0.13382211305146371, "grad_norm": 0.25390625, "grad_norm_var": 0.001797926425933838, "learning_rate": 0.0001, "loss": 1.3292, "loss/crossentropy": 2.206857204437256, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.172943577170372, "step": 8962 }, { "epoch": 0.1338370452220787, "grad_norm": 0.234375, "grad_norm_var": 0.0017038941383361816, "learning_rate": 0.0001, "loss": 1.3728, "loss/crossentropy": 2.7301031351089478, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.20092743635177612, "step": 8963 }, { "epoch": 0.1338519773926937, "grad_norm": 0.2421875, "grad_norm_var": 0.0017038941383361816, "learning_rate": 0.0001, "loss": 1.3665, "loss/crossentropy": 2.49771511554718, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.1828952059149742, "step": 8964 }, { "epoch": 0.13386690956330868, "grad_norm": 0.2421875, "grad_norm_var": 0.0016280134518941243, "learning_rate": 0.0001, "loss": 1.3084, "loss/crossentropy": 2.621271014213562, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.17944984138011932, "step": 8965 }, { "epoch": 0.13388184173392365, "grad_norm": 0.28125, "grad_norm_var": 0.00149839719136556, "learning_rate": 0.0001, "loss": 1.3694, "loss/crossentropy": 2.814085006713867, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19752178341150284, "step": 8966 }, { "epoch": 0.13389677390453864, "grad_norm": 0.2158203125, "grad_norm_var": 0.0015935103098551433, "learning_rate": 0.0001, "loss": 1.4109, "loss/crossentropy": 2.595519542694092, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.21947424113750458, "step": 8967 }, { "epoch": 0.13391170607515362, "grad_norm": 0.234375, "grad_norm_var": 0.0016133467356363933, "learning_rate": 0.0001, "loss": 1.3099, "loss/crossentropy": 2.3914698362350464, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.16541562229394913, "step": 8968 }, { "epoch": 0.1339266382457686, "grad_norm": 0.2236328125, "grad_norm_var": 0.0016602476437886555, "learning_rate": 0.0001, "loss": 1.2999, "loss/crossentropy": 2.5647459030151367, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.1787952035665512, "step": 8969 }, { "epoch": 0.13394157041638358, "grad_norm": 0.2353515625, "grad_norm_var": 0.00151595671971639, "learning_rate": 0.0001, "loss": 1.3395, "loss/crossentropy": 2.562135338783264, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.18327580392360687, "step": 8970 }, { "epoch": 0.13395650258699857, "grad_norm": 0.486328125, "grad_norm_var": 0.004169873396555583, "learning_rate": 0.0001, "loss": 1.6437, "loss/crossentropy": 2.6400972604751587, "loss/fcd": 1.359375, "loss/idx": 10.5, "loss/logits": 0.2842964306473732, "step": 8971 }, { "epoch": 0.13397143475761353, "grad_norm": 0.2138671875, "grad_norm_var": 0.004299354553222656, "learning_rate": 0.0001, "loss": 1.4024, "loss/crossentropy": 2.3736461400985718, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.21490861475467682, "step": 8972 }, { "epoch": 0.13398636692822852, "grad_norm": 0.212890625, "grad_norm_var": 0.00438683827718099, "learning_rate": 0.0001, "loss": 1.2841, "loss/crossentropy": 2.6562159061431885, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.18255377560853958, "step": 8973 }, { "epoch": 0.1340012990988435, "grad_norm": 0.265625, "grad_norm_var": 0.00431207021077474, "learning_rate": 0.0001, "loss": 1.4322, "loss/crossentropy": 2.4346073865890503, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.19778963923454285, "step": 8974 }, { "epoch": 0.1340162312694585, "grad_norm": 0.2314453125, "grad_norm_var": 0.0042692780494689945, "learning_rate": 0.0001, "loss": 1.1551, "loss/crossentropy": 2.807101249694824, "loss/fcd": 1.01171875, "loss/idx": 10.5, "loss/logits": 0.14339295029640198, "step": 8975 }, { "epoch": 0.13403116344007346, "grad_norm": 0.25, "grad_norm_var": 0.004244577884674072, "learning_rate": 0.0001, "loss": 1.4656, "loss/crossentropy": 2.6911070346832275, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.22734621167182922, "step": 8976 }, { "epoch": 0.13404609561068845, "grad_norm": 0.2275390625, "grad_norm_var": 0.004209709167480469, "learning_rate": 0.0001, "loss": 1.4041, "loss/crossentropy": 2.419386148452759, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.20096468180418015, "step": 8977 }, { "epoch": 0.13406102778130344, "grad_norm": 0.25390625, "grad_norm_var": 0.004209709167480469, "learning_rate": 0.0001, "loss": 1.4487, "loss/crossentropy": 2.5132553577423096, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.21433867514133453, "step": 8978 }, { "epoch": 0.1340759599519184, "grad_norm": 0.2333984375, "grad_norm_var": 0.00421221653620402, "learning_rate": 0.0001, "loss": 1.3602, "loss/crossentropy": 2.6053664684295654, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.19221963733434677, "step": 8979 }, { "epoch": 0.1340908921225334, "grad_norm": 0.2021484375, "grad_norm_var": 0.004370737075805664, "learning_rate": 0.0001, "loss": 1.2205, "loss/crossentropy": 2.6372387409210205, "loss/fcd": 1.05078125, "loss/idx": 10.5, "loss/logits": 0.16968395560979843, "step": 8980 }, { "epoch": 0.13410582429314838, "grad_norm": 0.259765625, "grad_norm_var": 0.004370307922363282, "learning_rate": 0.0001, "loss": 1.4077, "loss/crossentropy": 2.7273374795913696, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.22015609592199326, "step": 8981 }, { "epoch": 0.13412075646376337, "grad_norm": 0.234375, "grad_norm_var": 0.004323005676269531, "learning_rate": 0.0001, "loss": 1.3469, "loss/crossentropy": 2.6090868711471558, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.19453642517328262, "step": 8982 }, { "epoch": 0.13413568863437833, "grad_norm": 0.2265625, "grad_norm_var": 0.004283010959625244, "learning_rate": 0.0001, "loss": 1.414, "loss/crossentropy": 2.5384570360183716, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.21474747359752655, "step": 8983 }, { "epoch": 0.13415062080499332, "grad_norm": 0.228515625, "grad_norm_var": 0.004296934604644776, "learning_rate": 0.0001, "loss": 1.3816, "loss/crossentropy": 2.6817320585250854, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.1980552151799202, "step": 8984 }, { "epoch": 0.1341655529756083, "grad_norm": 0.25390625, "grad_norm_var": 0.0042514801025390625, "learning_rate": 0.0001, "loss": 1.4589, "loss/crossentropy": 2.7038352489471436, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.2206534892320633, "step": 8985 }, { "epoch": 0.13418048514622327, "grad_norm": 0.240234375, "grad_norm_var": 0.00424279769261678, "learning_rate": 0.0001, "loss": 1.3131, "loss/crossentropy": 2.3085930347442627, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.18033570051193237, "step": 8986 }, { "epoch": 0.13419541731683826, "grad_norm": 0.251953125, "grad_norm_var": 0.0003308256467183431, "learning_rate": 0.0001, "loss": 1.2754, "loss/crossentropy": 2.5831457376480103, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.16211149096488953, "step": 8987 }, { "epoch": 0.13421034948745325, "grad_norm": 0.259765625, "grad_norm_var": 0.0003231684366861979, "learning_rate": 0.0001, "loss": 1.2903, "loss/crossentropy": 2.62047278881073, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18094506114721298, "step": 8988 }, { "epoch": 0.1342252816580682, "grad_norm": 0.23828125, "grad_norm_var": 0.00027337074279785154, "learning_rate": 0.0001, "loss": 1.284, "loss/crossentropy": 2.840290904045105, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.17850589752197266, "step": 8989 }, { "epoch": 0.1342402138286832, "grad_norm": 0.2421875, "grad_norm_var": 0.00023102760314941406, "learning_rate": 0.0001, "loss": 1.3308, "loss/crossentropy": 2.5035955905914307, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.19014805555343628, "step": 8990 }, { "epoch": 0.1342551459992982, "grad_norm": 0.259765625, "grad_norm_var": 0.0002502719561258952, "learning_rate": 0.0001, "loss": 1.3135, "loss/crossentropy": 2.7613089084625244, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.1806730106472969, "step": 8991 }, { "epoch": 0.13427007816991318, "grad_norm": 0.26171875, "grad_norm_var": 0.00027230183283487953, "learning_rate": 0.0001, "loss": 1.4359, "loss/crossentropy": 2.228736639022827, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.20544227957725525, "step": 8992 }, { "epoch": 0.13428501034052814, "grad_norm": 0.212890625, "grad_norm_var": 0.00031420389811197914, "learning_rate": 0.0001, "loss": 1.2727, "loss/crossentropy": 2.7183083295822144, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.18285758048295975, "step": 8993 }, { "epoch": 0.13429994251114313, "grad_norm": 0.2431640625, "grad_norm_var": 0.00030323266983032224, "learning_rate": 0.0001, "loss": 1.3443, "loss/crossentropy": 2.629292845726013, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.19975679367780685, "step": 8994 }, { "epoch": 0.13431487468175812, "grad_norm": 0.26171875, "grad_norm_var": 0.00032639503479003906, "learning_rate": 0.0001, "loss": 1.3994, "loss/crossentropy": 2.5936681032180786, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.2040819749236107, "step": 8995 }, { "epoch": 0.13432980685237308, "grad_norm": 0.2578125, "grad_norm_var": 0.00022197961807250975, "learning_rate": 0.0001, "loss": 1.5486, "loss/crossentropy": 2.3337674140930176, "loss/fcd": 1.3125, "loss/idx": 10.5, "loss/logits": 0.23605160415172577, "step": 8996 }, { "epoch": 0.13434473902298807, "grad_norm": 0.25390625, "grad_norm_var": 0.00021320581436157227, "learning_rate": 0.0001, "loss": 1.2892, "loss/crossentropy": 2.4656184911727905, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1759684458374977, "step": 8997 }, { "epoch": 0.13435967119360306, "grad_norm": 0.2353515625, "grad_norm_var": 0.00021182696024576824, "learning_rate": 0.0001, "loss": 1.4075, "loss/crossentropy": 2.3461859226226807, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.19659055024385452, "step": 8998 }, { "epoch": 0.13437460336421803, "grad_norm": 0.232421875, "grad_norm_var": 0.00019919077555338542, "learning_rate": 0.0001, "loss": 1.4126, "loss/crossentropy": 2.403777837753296, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.21337126195430756, "step": 8999 }, { "epoch": 0.13438953553483302, "grad_norm": 0.2216796875, "grad_norm_var": 0.0002179106076558431, "learning_rate": 0.0001, "loss": 1.3834, "loss/crossentropy": 2.6203662157058716, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.21151897311210632, "step": 9000 }, { "epoch": 0.134404467705448, "grad_norm": 0.2392578125, "grad_norm_var": 0.0002147515614827474, "learning_rate": 0.0001, "loss": 1.3632, "loss/crossentropy": 2.557120680809021, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19129469245672226, "step": 9001 }, { "epoch": 0.134419399876063, "grad_norm": 0.25390625, "grad_norm_var": 0.00021864573160807293, "learning_rate": 0.0001, "loss": 1.5015, "loss/crossentropy": 2.5147247314453125, "loss/fcd": 1.2734375, "loss/idx": 10.5, "loss/logits": 0.22803860902786255, "step": 9002 }, { "epoch": 0.13443433204667796, "grad_norm": 0.205078125, "grad_norm_var": 0.0003147761027018229, "learning_rate": 0.0001, "loss": 1.2494, "loss/crossentropy": 2.5020923614501953, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.16739538311958313, "step": 9003 }, { "epoch": 0.13444926421729295, "grad_norm": 0.2373046875, "grad_norm_var": 0.00029439528783162434, "learning_rate": 0.0001, "loss": 1.2586, "loss/crossentropy": 2.4640177488327026, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.16876360774040222, "step": 9004 }, { "epoch": 0.13446419638790794, "grad_norm": 0.2578125, "grad_norm_var": 0.0003110845883687337, "learning_rate": 0.0001, "loss": 1.393, "loss/crossentropy": 2.6850340366363525, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.20160596072673798, "step": 9005 }, { "epoch": 0.1344791285585229, "grad_norm": 0.2353515625, "grad_norm_var": 0.0003140608469645182, "learning_rate": 0.0001, "loss": 1.3206, "loss/crossentropy": 2.457969307899475, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.19165068119764328, "step": 9006 }, { "epoch": 0.1344940607291379, "grad_norm": 0.25, "grad_norm_var": 0.00029665629069010416, "learning_rate": 0.0001, "loss": 1.3988, "loss/crossentropy": 2.46100115776062, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.19571398198604584, "step": 9007 }, { "epoch": 0.13450899289975288, "grad_norm": 0.2451171875, "grad_norm_var": 0.00026848713556925453, "learning_rate": 0.0001, "loss": 1.4531, "loss/crossentropy": 2.6816768646240234, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.21093697100877762, "step": 9008 }, { "epoch": 0.13452392507036787, "grad_norm": 0.265625, "grad_norm_var": 0.00025046269098917643, "learning_rate": 0.0001, "loss": 1.3805, "loss/crossentropy": 2.746898889541626, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20475879311561584, "step": 9009 }, { "epoch": 0.13453885724098283, "grad_norm": 0.2236328125, "grad_norm_var": 0.00027509927749633787, "learning_rate": 0.0001, "loss": 1.3806, "loss/crossentropy": 2.52741801738739, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.20089206099510193, "step": 9010 }, { "epoch": 0.13455378941159782, "grad_norm": 0.2373046875, "grad_norm_var": 0.00024897257486979166, "learning_rate": 0.0001, "loss": 1.3786, "loss/crossentropy": 2.5503664016723633, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.2067095786333084, "step": 9011 }, { "epoch": 0.1345687215822128, "grad_norm": 0.23828125, "grad_norm_var": 0.00022830963134765626, "learning_rate": 0.0001, "loss": 1.3063, "loss/crossentropy": 2.442938804626465, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.17739365994930267, "step": 9012 }, { "epoch": 0.13458365375282777, "grad_norm": 0.234375, "grad_norm_var": 0.00021464029947916666, "learning_rate": 0.0001, "loss": 1.5376, "loss/crossentropy": 2.456355571746826, "loss/fcd": 1.30078125, "loss/idx": 10.5, "loss/logits": 0.2368568480014801, "step": 9013 }, { "epoch": 0.13459858592344276, "grad_norm": 0.2294921875, "grad_norm_var": 0.00021907488505045573, "learning_rate": 0.0001, "loss": 1.2652, "loss/crossentropy": 2.6660386323928833, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.1792365238070488, "step": 9014 }, { "epoch": 0.13461351809405775, "grad_norm": 0.234375, "grad_norm_var": 0.00021788279215494792, "learning_rate": 0.0001, "loss": 1.3011, "loss/crossentropy": 2.39298939704895, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.17217986285686493, "step": 9015 }, { "epoch": 0.1346284502646727, "grad_norm": 0.22265625, "grad_norm_var": 0.00021581252415974934, "learning_rate": 0.0001, "loss": 1.3299, "loss/crossentropy": 2.4277820587158203, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.19321782141923904, "step": 9016 }, { "epoch": 0.1346433824352877, "grad_norm": 0.2119140625, "grad_norm_var": 0.0002583146095275879, "learning_rate": 0.0001, "loss": 1.3492, "loss/crossentropy": 2.442340135574341, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.1968545988202095, "step": 9017 }, { "epoch": 0.1346583146059027, "grad_norm": 0.240234375, "grad_norm_var": 0.00023806492487589518, "learning_rate": 0.0001, "loss": 1.3127, "loss/crossentropy": 2.687669277191162, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.17987649142742157, "step": 9018 }, { "epoch": 0.13467324677651768, "grad_norm": 0.2333984375, "grad_norm_var": 0.000173187255859375, "learning_rate": 0.0001, "loss": 1.3498, "loss/crossentropy": 2.5733213424682617, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.19359112530946732, "step": 9019 }, { "epoch": 0.13468817894713264, "grad_norm": 0.2236328125, "grad_norm_var": 0.00018486976623535156, "learning_rate": 0.0001, "loss": 1.3289, "loss/crossentropy": 2.6808425188064575, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.19604454934597015, "step": 9020 }, { "epoch": 0.13470311111774763, "grad_norm": 0.255859375, "grad_norm_var": 0.00017954508463541668, "learning_rate": 0.0001, "loss": 1.4591, "loss/crossentropy": 2.6232876777648926, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.21691139042377472, "step": 9021 }, { "epoch": 0.13471804328836262, "grad_norm": 0.251953125, "grad_norm_var": 0.0001946091651916504, "learning_rate": 0.0001, "loss": 1.3284, "loss/crossentropy": 2.5918079614639282, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.17996758222579956, "step": 9022 }, { "epoch": 0.13473297545897758, "grad_norm": 0.24609375, "grad_norm_var": 0.0001889824867248535, "learning_rate": 0.0001, "loss": 1.4012, "loss/crossentropy": 2.347033381462097, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.1980540081858635, "step": 9023 }, { "epoch": 0.13474790762959257, "grad_norm": 0.2099609375, "grad_norm_var": 0.00022875070571899415, "learning_rate": 0.0001, "loss": 1.2876, "loss/crossentropy": 2.377698302268982, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.18210018426179886, "step": 9024 }, { "epoch": 0.13476283980020756, "grad_norm": 0.2373046875, "grad_norm_var": 0.00016295115152994792, "learning_rate": 0.0001, "loss": 1.3871, "loss/crossentropy": 2.3454296588897705, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.1839846521615982, "step": 9025 }, { "epoch": 0.13477777197082255, "grad_norm": 0.2421875, "grad_norm_var": 0.00016091267267862955, "learning_rate": 0.0001, "loss": 1.4147, "loss/crossentropy": 2.490793466567993, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.1959376335144043, "step": 9026 }, { "epoch": 0.1347927041414375, "grad_norm": 0.216796875, "grad_norm_var": 0.00017902056376139323, "learning_rate": 0.0001, "loss": 1.3416, "loss/crossentropy": 2.4783581495285034, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.19315598905086517, "step": 9027 }, { "epoch": 0.1348076363120525, "grad_norm": 0.337890625, "grad_norm_var": 0.0008688608805338542, "learning_rate": 0.0001, "loss": 1.4057, "loss/crossentropy": 2.3399734497070312, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.1752723976969719, "step": 9028 }, { "epoch": 0.1348225684826675, "grad_norm": 0.2021484375, "grad_norm_var": 0.0009547511736551921, "learning_rate": 0.0001, "loss": 1.1886, "loss/crossentropy": 2.595216989517212, "loss/fcd": 1.02734375, "loss/idx": 10.5, "loss/logits": 0.16129020601511002, "step": 9029 }, { "epoch": 0.13483750065328245, "grad_norm": 0.248046875, "grad_norm_var": 0.0009570916493733724, "learning_rate": 0.0001, "loss": 1.2665, "loss/crossentropy": 2.675284504890442, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.1727023646235466, "step": 9030 }, { "epoch": 0.13485243282389744, "grad_norm": 0.265625, "grad_norm_var": 0.0010013421376546224, "learning_rate": 0.0001, "loss": 1.4751, "loss/crossentropy": 2.650937080383301, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.225085087120533, "step": 9031 }, { "epoch": 0.13486736499451243, "grad_norm": 0.224609375, "grad_norm_var": 0.0009969711303710938, "learning_rate": 0.0001, "loss": 1.3309, "loss/crossentropy": 2.486043691635132, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.18640732020139694, "step": 9032 }, { "epoch": 0.1348822971651274, "grad_norm": 0.236328125, "grad_norm_var": 0.0009412407875061035, "learning_rate": 0.0001, "loss": 1.2563, "loss/crossentropy": 2.6067928075790405, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.1586657017469406, "step": 9033 }, { "epoch": 0.13489722933574239, "grad_norm": 0.263671875, "grad_norm_var": 0.0009700417518615723, "learning_rate": 0.0001, "loss": 1.4907, "loss/crossentropy": 2.275155544281006, "loss/fcd": 1.28515625, "loss/idx": 10.5, "loss/logits": 0.2055302858352661, "step": 9034 }, { "epoch": 0.13491216150635738, "grad_norm": 0.220703125, "grad_norm_var": 0.000997161865234375, "learning_rate": 0.0001, "loss": 1.4015, "loss/crossentropy": 2.6291180849075317, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.22179225832223892, "step": 9035 }, { "epoch": 0.13492709367697236, "grad_norm": 0.251953125, "grad_norm_var": 0.0009753823280334473, "learning_rate": 0.0001, "loss": 1.458, "loss/crossentropy": 2.5448265075683594, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.21976704895496368, "step": 9036 }, { "epoch": 0.13494202584758733, "grad_norm": 0.2158203125, "grad_norm_var": 0.0010146458943684896, "learning_rate": 0.0001, "loss": 1.3044, "loss/crossentropy": 2.5756319761276245, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1755363568663597, "step": 9037 }, { "epoch": 0.13495695801820232, "grad_norm": 0.259765625, "grad_norm_var": 0.0010288874308268228, "learning_rate": 0.0001, "loss": 1.4605, "loss/crossentropy": 2.869335174560547, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.230077363550663, "step": 9038 }, { "epoch": 0.1349718901888173, "grad_norm": 0.23828125, "grad_norm_var": 0.0010288874308268228, "learning_rate": 0.0001, "loss": 1.3206, "loss/crossentropy": 2.476270318031311, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.1721959412097931, "step": 9039 }, { "epoch": 0.13498682235943227, "grad_norm": 0.236328125, "grad_norm_var": 0.0009599010149637858, "learning_rate": 0.0001, "loss": 1.4261, "loss/crossentropy": 2.6643149852752686, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.21516184508800507, "step": 9040 }, { "epoch": 0.13500175453004726, "grad_norm": 0.2412109375, "grad_norm_var": 0.0009575804074605306, "learning_rate": 0.0001, "loss": 1.475, "loss/crossentropy": 2.3977819681167603, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.21329400688409805, "step": 9041 }, { "epoch": 0.13501668670066225, "grad_norm": 0.2421875, "grad_norm_var": 0.0009575804074605306, "learning_rate": 0.0001, "loss": 1.5536, "loss/crossentropy": 2.500170588493347, "loss/fcd": 1.30859375, "loss/idx": 10.5, "loss/logits": 0.24502845108509064, "step": 9042 }, { "epoch": 0.13503161887127724, "grad_norm": 0.251953125, "grad_norm_var": 0.0009080847104390462, "learning_rate": 0.0001, "loss": 1.3015, "loss/crossentropy": 2.5790162086486816, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.17256193608045578, "step": 9043 }, { "epoch": 0.1350465510418922, "grad_norm": 0.263671875, "grad_norm_var": 0.00034335056940714516, "learning_rate": 0.0001, "loss": 1.3553, "loss/crossentropy": 2.5570948123931885, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.1951497495174408, "step": 9044 }, { "epoch": 0.1350614832125072, "grad_norm": 0.2216796875, "grad_norm_var": 0.00026499032974243165, "learning_rate": 0.0001, "loss": 1.2655, "loss/crossentropy": 2.6968584060668945, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.1678587794303894, "step": 9045 }, { "epoch": 0.13507641538312218, "grad_norm": 0.2314453125, "grad_norm_var": 0.0002701918284098307, "learning_rate": 0.0001, "loss": 1.325, "loss/crossentropy": 2.7650192975997925, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.1999959647655487, "step": 9046 }, { "epoch": 0.13509134755373714, "grad_norm": 0.24609375, "grad_norm_var": 0.00023140907287597657, "learning_rate": 0.0001, "loss": 1.4271, "loss/crossentropy": 2.6284289360046387, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.2044549435377121, "step": 9047 }, { "epoch": 0.13510627972435213, "grad_norm": 0.2109375, "grad_norm_var": 0.00027179718017578125, "learning_rate": 0.0001, "loss": 1.4208, "loss/crossentropy": 2.4181835651397705, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.21379423141479492, "step": 9048 }, { "epoch": 0.13512121189496712, "grad_norm": 0.2255859375, "grad_norm_var": 0.0002835551897684733, "learning_rate": 0.0001, "loss": 1.3328, "loss/crossentropy": 2.483020782470703, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.19608157128095627, "step": 9049 }, { "epoch": 0.13513614406558208, "grad_norm": 0.234375, "grad_norm_var": 0.00024016300837198894, "learning_rate": 0.0001, "loss": 1.2445, "loss/crossentropy": 2.730392098426819, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.16251524537801743, "step": 9050 }, { "epoch": 0.13515107623619707, "grad_norm": 0.22265625, "grad_norm_var": 0.00023615757624308268, "learning_rate": 0.0001, "loss": 1.4568, "loss/crossentropy": 2.6042819023132324, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.22244183719158173, "step": 9051 }, { "epoch": 0.13516600840681206, "grad_norm": 0.2412109375, "grad_norm_var": 0.00022212664286295572, "learning_rate": 0.0001, "loss": 1.3696, "loss/crossentropy": 2.6123266220092773, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.1976771205663681, "step": 9052 }, { "epoch": 0.13518094057742705, "grad_norm": 0.26171875, "grad_norm_var": 0.00022754271825154622, "learning_rate": 0.0001, "loss": 1.5731, "loss/crossentropy": 2.7320475578308105, "loss/fcd": 1.3046875, "loss/idx": 10.5, "loss/logits": 0.26843903958797455, "step": 9053 }, { "epoch": 0.135195872748042, "grad_norm": 0.2119140625, "grad_norm_var": 0.00024019877115885417, "learning_rate": 0.0001, "loss": 1.249, "loss/crossentropy": 2.462882876396179, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.17083963006734848, "step": 9054 }, { "epoch": 0.135210804918657, "grad_norm": 0.25390625, "grad_norm_var": 0.0002595265706380208, "learning_rate": 0.0001, "loss": 1.3216, "loss/crossentropy": 2.6466023921966553, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.19264902919530869, "step": 9055 }, { "epoch": 0.135225737089272, "grad_norm": 0.232421875, "grad_norm_var": 0.00026098887125651043, "learning_rate": 0.0001, "loss": 1.2683, "loss/crossentropy": 2.6688451766967773, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.1901998519897461, "step": 9056 }, { "epoch": 0.13524066925988695, "grad_norm": 0.2275390625, "grad_norm_var": 0.0002651055653889974, "learning_rate": 0.0001, "loss": 1.3656, "loss/crossentropy": 2.3701281547546387, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.19367988407611847, "step": 9057 }, { "epoch": 0.13525560143050194, "grad_norm": 0.318359375, "grad_norm_var": 0.000688489278157552, "learning_rate": 0.0001, "loss": 1.7199, "loss/crossentropy": 2.423821449279785, "loss/fcd": 1.46484375, "loss/idx": 10.5, "loss/logits": 0.25504622608423233, "step": 9058 }, { "epoch": 0.13527053360111693, "grad_norm": 0.2431640625, "grad_norm_var": 0.0006804426511128743, "learning_rate": 0.0001, "loss": 1.2981, "loss/crossentropy": 2.501355290412903, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.18478579074144363, "step": 9059 }, { "epoch": 0.1352854657717319, "grad_norm": 0.30078125, "grad_norm_var": 0.0008815725644429525, "learning_rate": 0.0001, "loss": 1.4336, "loss/crossentropy": 2.6145607233047485, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.20308514684438705, "step": 9060 }, { "epoch": 0.13530039794234688, "grad_norm": 0.259765625, "grad_norm_var": 0.0008653004964192709, "learning_rate": 0.0001, "loss": 1.384, "loss/crossentropy": 2.2915509939193726, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.1847349777817726, "step": 9061 }, { "epoch": 0.13531533011296187, "grad_norm": 0.25, "grad_norm_var": 0.0008529941240946452, "learning_rate": 0.0001, "loss": 1.3417, "loss/crossentropy": 2.9354753494262695, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.19716832041740417, "step": 9062 }, { "epoch": 0.13533026228357686, "grad_norm": 0.23828125, "grad_norm_var": 0.0008569995562235515, "learning_rate": 0.0001, "loss": 1.3986, "loss/crossentropy": 2.5801289081573486, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.21891237795352936, "step": 9063 }, { "epoch": 0.13534519445419183, "grad_norm": 0.22265625, "grad_norm_var": 0.0008111278216044108, "learning_rate": 0.0001, "loss": 1.2943, "loss/crossentropy": 2.6330721378326416, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1810312569141388, "step": 9064 }, { "epoch": 0.13536012662480681, "grad_norm": 0.2373046875, "grad_norm_var": 0.0007869998613993326, "learning_rate": 0.0001, "loss": 1.3894, "loss/crossentropy": 2.5237537622451782, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.2057824358344078, "step": 9065 }, { "epoch": 0.1353750587954218, "grad_norm": 0.2373046875, "grad_norm_var": 0.0007825056711832682, "learning_rate": 0.0001, "loss": 1.2913, "loss/crossentropy": 3.124777317047119, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.17016025632619858, "step": 9066 }, { "epoch": 0.13538999096603677, "grad_norm": 0.279296875, "grad_norm_var": 0.0007958730061848958, "learning_rate": 0.0001, "loss": 1.5117, "loss/crossentropy": 2.703639268875122, "loss/fcd": 1.26953125, "loss/idx": 10.5, "loss/logits": 0.2422095686197281, "step": 9067 }, { "epoch": 0.13540492313665176, "grad_norm": 0.2275390625, "grad_norm_var": 0.0008253574371337891, "learning_rate": 0.0001, "loss": 1.3862, "loss/crossentropy": 2.4593125581741333, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.21042537689208984, "step": 9068 }, { "epoch": 0.13541985530726675, "grad_norm": 0.2236328125, "grad_norm_var": 0.0008571267127990723, "learning_rate": 0.0001, "loss": 1.3608, "loss/crossentropy": 2.514841079711914, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.19670559465885162, "step": 9069 }, { "epoch": 0.13543478747788174, "grad_norm": 0.2294921875, "grad_norm_var": 0.0007924675941467285, "learning_rate": 0.0001, "loss": 1.4229, "loss/crossentropy": 2.3905787467956543, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.20419563353061676, "step": 9070 }, { "epoch": 0.1354497196484967, "grad_norm": 0.2412109375, "grad_norm_var": 0.0007939656575520833, "learning_rate": 0.0001, "loss": 1.3658, "loss/crossentropy": 2.2587920427322388, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.18607839941978455, "step": 9071 }, { "epoch": 0.1354646518191117, "grad_norm": 0.265625, "grad_norm_var": 0.0007936954498291016, "learning_rate": 0.0001, "loss": 1.2948, "loss/crossentropy": 2.5330291986465454, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.169776052236557, "step": 9072 }, { "epoch": 0.13547958398972668, "grad_norm": 0.294921875, "grad_norm_var": 0.0008745789527893066, "learning_rate": 0.0001, "loss": 1.6419, "loss/crossentropy": 2.379903554916382, "loss/fcd": 1.3828125, "loss/idx": 10.5, "loss/logits": 0.25903867185115814, "step": 9073 }, { "epoch": 0.13549451616034164, "grad_norm": 0.2265625, "grad_norm_var": 0.0006175955136617024, "learning_rate": 0.0001, "loss": 1.3024, "loss/crossentropy": 2.6958030462265015, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.18908192217350006, "step": 9074 }, { "epoch": 0.13550944833095663, "grad_norm": 0.255859375, "grad_norm_var": 0.0006184736887613932, "learning_rate": 0.0001, "loss": 1.3623, "loss/crossentropy": 2.639586567878723, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.19428808242082596, "step": 9075 }, { "epoch": 0.13552438050157162, "grad_norm": 0.208984375, "grad_norm_var": 0.0005161285400390625, "learning_rate": 0.0001, "loss": 1.2549, "loss/crossentropy": 2.682138681411743, "loss/fcd": 1.072265625, "loss/idx": 10.5, "loss/logits": 0.1826208531856537, "step": 9076 }, { "epoch": 0.13553931267218658, "grad_norm": 0.291015625, "grad_norm_var": 0.0006443023681640625, "learning_rate": 0.0001, "loss": 1.5809, "loss/crossentropy": 2.451165199279785, "loss/fcd": 1.3359375, "loss/idx": 10.5, "loss/logits": 0.24497700482606888, "step": 9077 }, { "epoch": 0.13555424484280157, "grad_norm": 0.2255859375, "grad_norm_var": 0.0006672501564025879, "learning_rate": 0.0001, "loss": 1.263, "loss/crossentropy": 2.6094084978103638, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.18488319963216782, "step": 9078 }, { "epoch": 0.13556917701341656, "grad_norm": 0.265625, "grad_norm_var": 0.0006928404172261555, "learning_rate": 0.0001, "loss": 1.3936, "loss/crossentropy": 2.708075165748596, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.21777500957250595, "step": 9079 }, { "epoch": 0.13558410918403155, "grad_norm": 0.236328125, "grad_norm_var": 0.0006623546282450358, "learning_rate": 0.0001, "loss": 1.3226, "loss/crossentropy": 2.548478126525879, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.18974393606185913, "step": 9080 }, { "epoch": 0.1355990413546465, "grad_norm": 0.2431640625, "grad_norm_var": 0.0006572047869364421, "learning_rate": 0.0001, "loss": 1.3427, "loss/crossentropy": 2.513753294944763, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.19034884870052338, "step": 9081 }, { "epoch": 0.1356139735252615, "grad_norm": 0.28125, "grad_norm_var": 0.0007210413614908855, "learning_rate": 0.0001, "loss": 1.4661, "loss/crossentropy": 2.51445472240448, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.20042936503887177, "step": 9082 }, { "epoch": 0.1356289056958765, "grad_norm": 0.255859375, "grad_norm_var": 0.0006630579630533854, "learning_rate": 0.0001, "loss": 1.4966, "loss/crossentropy": 2.5130467414855957, "loss/fcd": 1.27734375, "loss/idx": 10.5, "loss/logits": 0.21928900480270386, "step": 9083 }, { "epoch": 0.13564383786649145, "grad_norm": 0.2373046875, "grad_norm_var": 0.0006419976552327473, "learning_rate": 0.0001, "loss": 1.279, "loss/crossentropy": 2.6864912509918213, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.1813521757721901, "step": 9084 }, { "epoch": 0.13565877003710644, "grad_norm": 0.25390625, "grad_norm_var": 0.0005972822507222494, "learning_rate": 0.0001, "loss": 1.3616, "loss/crossentropy": 2.746057391166687, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.20140758156776428, "step": 9085 }, { "epoch": 0.13567370220772143, "grad_norm": 0.2314453125, "grad_norm_var": 0.0005919734636942545, "learning_rate": 0.0001, "loss": 1.3071, "loss/crossentropy": 2.475345253944397, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.17819667607545853, "step": 9086 }, { "epoch": 0.13568863437833642, "grad_norm": 0.2314453125, "grad_norm_var": 0.0006105701128641764, "learning_rate": 0.0001, "loss": 1.2114, "loss/crossentropy": 2.458363175392151, "loss/fcd": 1.05859375, "loss/idx": 10.5, "loss/logits": 0.15278272330760956, "step": 9087 }, { "epoch": 0.13570356654895138, "grad_norm": 0.2314453125, "grad_norm_var": 0.0006137688954671224, "learning_rate": 0.0001, "loss": 1.3369, "loss/crossentropy": 2.4863134622573853, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.19233392924070358, "step": 9088 }, { "epoch": 0.13571849871956637, "grad_norm": 0.2578125, "grad_norm_var": 0.0004685084025065104, "learning_rate": 0.0001, "loss": 1.3902, "loss/crossentropy": 2.460832118988037, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.19095491617918015, "step": 9089 }, { "epoch": 0.13573343089018136, "grad_norm": 0.26171875, "grad_norm_var": 0.00045534769694010414, "learning_rate": 0.0001, "loss": 1.4711, "loss/crossentropy": 2.4767255783081055, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.22111355513334274, "step": 9090 }, { "epoch": 0.13574836306079632, "grad_norm": 0.255859375, "grad_norm_var": 0.00045534769694010414, "learning_rate": 0.0001, "loss": 1.6222, "loss/crossentropy": 2.229658007621765, "loss/fcd": 1.4140625, "loss/idx": 10.5, "loss/logits": 0.2080930545926094, "step": 9091 }, { "epoch": 0.1357632952314113, "grad_norm": 0.2138671875, "grad_norm_var": 0.0004314064979553223, "learning_rate": 0.0001, "loss": 1.235, "loss/crossentropy": 2.264620542526245, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.14907696098089218, "step": 9092 }, { "epoch": 0.1357782274020263, "grad_norm": 0.265625, "grad_norm_var": 0.0003272652626037598, "learning_rate": 0.0001, "loss": 1.3747, "loss/crossentropy": 2.4654934406280518, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.19501248747110367, "step": 9093 }, { "epoch": 0.13579315957264126, "grad_norm": 0.2353515625, "grad_norm_var": 0.0003056486447652181, "learning_rate": 0.0001, "loss": 1.2992, "loss/crossentropy": 2.7487775087356567, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.19377590715885162, "step": 9094 }, { "epoch": 0.13580809174325625, "grad_norm": 0.2060546875, "grad_norm_var": 0.00038248697916666666, "learning_rate": 0.0001, "loss": 1.2503, "loss/crossentropy": 2.4297244548797607, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.1604987010359764, "step": 9095 }, { "epoch": 0.13582302391387124, "grad_norm": 0.220703125, "grad_norm_var": 0.00041300455729166666, "learning_rate": 0.0001, "loss": 1.1884, "loss/crossentropy": 2.5420001745224, "loss/fcd": 1.04296875, "loss/idx": 10.5, "loss/logits": 0.14548057317733765, "step": 9096 }, { "epoch": 0.13583795608448623, "grad_norm": 0.2373046875, "grad_norm_var": 0.00041476885477701825, "learning_rate": 0.0001, "loss": 1.5033, "loss/crossentropy": 2.4917300939559937, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.24158813059329987, "step": 9097 }, { "epoch": 0.1358528882551012, "grad_norm": 0.21875, "grad_norm_var": 0.00033440589904785154, "learning_rate": 0.0001, "loss": 1.2202, "loss/crossentropy": 2.6689499616622925, "loss/fcd": 1.0625, "loss/idx": 10.5, "loss/logits": 0.1576979011297226, "step": 9098 }, { "epoch": 0.13586782042571618, "grad_norm": 0.19140625, "grad_norm_var": 0.00044403076171875, "learning_rate": 0.0001, "loss": 1.2208, "loss/crossentropy": 2.7009271383285522, "loss/fcd": 1.0546875, "loss/idx": 10.5, "loss/logits": 0.16614138334989548, "step": 9099 }, { "epoch": 0.13588275259633117, "grad_norm": 0.2412109375, "grad_norm_var": 0.00044651031494140624, "learning_rate": 0.0001, "loss": 1.3223, "loss/crossentropy": 2.4179420471191406, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.18167608231306076, "step": 9100 }, { "epoch": 0.13589768476694614, "grad_norm": 0.216796875, "grad_norm_var": 0.0004371484120686849, "learning_rate": 0.0001, "loss": 1.2826, "loss/crossentropy": 2.5408668518066406, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.18107444792985916, "step": 9101 }, { "epoch": 0.13591261693756113, "grad_norm": 0.2578125, "grad_norm_var": 0.000477596124013265, "learning_rate": 0.0001, "loss": 1.3043, "loss/crossentropy": 2.6180925369262695, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.1831919029355049, "step": 9102 }, { "epoch": 0.13592754910817612, "grad_norm": 0.3125, "grad_norm_var": 0.0008611679077148438, "learning_rate": 0.0001, "loss": 1.6145, "loss/crossentropy": 2.458434820175171, "loss/fcd": 1.3828125, "loss/idx": 10.5, "loss/logits": 0.23165678977966309, "step": 9103 }, { "epoch": 0.1359424812787911, "grad_norm": 0.25, "grad_norm_var": 0.0008639613787333171, "learning_rate": 0.0001, "loss": 1.3621, "loss/crossentropy": 2.692958354949951, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.20977020263671875, "step": 9104 }, { "epoch": 0.13595741344940607, "grad_norm": 0.275390625, "grad_norm_var": 0.0009246150652567546, "learning_rate": 0.0001, "loss": 1.5207, "loss/crossentropy": 2.8684136867523193, "loss/fcd": 1.28515625, "loss/idx": 10.5, "loss/logits": 0.23557322472333908, "step": 9105 }, { "epoch": 0.13597234562002106, "grad_norm": 0.23828125, "grad_norm_var": 0.0008950511614481608, "learning_rate": 0.0001, "loss": 1.4604, "loss/crossentropy": 2.759668469429016, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.2260051593184471, "step": 9106 }, { "epoch": 0.13598727779063605, "grad_norm": 0.2431640625, "grad_norm_var": 0.0008779525756835938, "learning_rate": 0.0001, "loss": 1.2941, "loss/crossentropy": 2.683041572570801, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.16519015282392502, "step": 9107 }, { "epoch": 0.136002209961251, "grad_norm": 0.24609375, "grad_norm_var": 0.000834810733795166, "learning_rate": 0.0001, "loss": 1.2976, "loss/crossentropy": 2.5625816583633423, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1686927080154419, "step": 9108 }, { "epoch": 0.136017142131866, "grad_norm": 0.26171875, "grad_norm_var": 0.0008229533831278483, "learning_rate": 0.0001, "loss": 1.4348, "loss/crossentropy": 2.3454480171203613, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.2043808400630951, "step": 9109 }, { "epoch": 0.136032074302481, "grad_norm": 0.2236328125, "grad_norm_var": 0.0008400241533915202, "learning_rate": 0.0001, "loss": 1.2673, "loss/crossentropy": 2.6970643997192383, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.17745943367481232, "step": 9110 }, { "epoch": 0.13604700647309595, "grad_norm": 0.2431640625, "grad_norm_var": 0.0007578810056050618, "learning_rate": 0.0001, "loss": 1.3235, "loss/crossentropy": 2.494803309440613, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.1750611886382103, "step": 9111 }, { "epoch": 0.13606193864371094, "grad_norm": 0.228515625, "grad_norm_var": 0.000739125410715739, "learning_rate": 0.0001, "loss": 1.3305, "loss/crossentropy": 2.798251748085022, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.2015710100531578, "step": 9112 }, { "epoch": 0.13607687081432593, "grad_norm": 0.24609375, "grad_norm_var": 0.0007374445597330729, "learning_rate": 0.0001, "loss": 1.2275, "loss/crossentropy": 2.674428939819336, "loss/fcd": 1.072265625, "loss/idx": 10.5, "loss/logits": 0.1552228182554245, "step": 9113 }, { "epoch": 0.13609180298494092, "grad_norm": 0.232421875, "grad_norm_var": 0.0007041772206624349, "learning_rate": 0.0001, "loss": 1.4642, "loss/crossentropy": 2.614610433578491, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.2258758693933487, "step": 9114 }, { "epoch": 0.13610673515555588, "grad_norm": 0.2158203125, "grad_norm_var": 0.0005693713823954265, "learning_rate": 0.0001, "loss": 1.3217, "loss/crossentropy": 2.4699708223342896, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.18499410897493362, "step": 9115 }, { "epoch": 0.13612166732617087, "grad_norm": 0.2431640625, "grad_norm_var": 0.0005684177080790202, "learning_rate": 0.0001, "loss": 1.284, "loss/crossentropy": 2.3148434162139893, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.17857858538627625, "step": 9116 }, { "epoch": 0.13613659949678586, "grad_norm": 0.291015625, "grad_norm_var": 0.0006245891253153483, "learning_rate": 0.0001, "loss": 1.6083, "loss/crossentropy": 2.594304323196411, "loss/fcd": 1.35546875, "loss/idx": 10.5, "loss/logits": 0.252826526761055, "step": 9117 }, { "epoch": 0.13615153166740082, "grad_norm": 0.25390625, "grad_norm_var": 0.0006217598915100097, "learning_rate": 0.0001, "loss": 1.4558, "loss/crossentropy": 2.351800560951233, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.20582307875156403, "step": 9118 }, { "epoch": 0.1361664638380158, "grad_norm": 0.22265625, "grad_norm_var": 0.0003812114397684733, "learning_rate": 0.0001, "loss": 1.2753, "loss/crossentropy": 2.8264445066452026, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.17767542600631714, "step": 9119 }, { "epoch": 0.1361813960086308, "grad_norm": 0.25390625, "grad_norm_var": 0.0003849307696024577, "learning_rate": 0.0001, "loss": 1.333, "loss/crossentropy": 2.4954386949539185, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.17679131031036377, "step": 9120 }, { "epoch": 0.13619632817924576, "grad_norm": 0.2578125, "grad_norm_var": 0.0003328601519266764, "learning_rate": 0.0001, "loss": 1.3948, "loss/crossentropy": 2.87446928024292, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.1994745284318924, "step": 9121 }, { "epoch": 0.13621126034986075, "grad_norm": 0.279296875, "grad_norm_var": 0.00040762821833292644, "learning_rate": 0.0001, "loss": 1.37, "loss/crossentropy": 2.623928189277649, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.1942189708352089, "step": 9122 }, { "epoch": 0.13622619252047574, "grad_norm": 0.2412109375, "grad_norm_var": 0.0004087090492248535, "learning_rate": 0.0001, "loss": 1.383, "loss/crossentropy": 2.583640694618225, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.1955074742436409, "step": 9123 }, { "epoch": 0.13624112469109073, "grad_norm": 0.2373046875, "grad_norm_var": 0.0004137516021728516, "learning_rate": 0.0001, "loss": 1.3967, "loss/crossentropy": 2.575296401977539, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.21702347695827484, "step": 9124 }, { "epoch": 0.1362560568617057, "grad_norm": 0.2373046875, "grad_norm_var": 0.00039894978205362954, "learning_rate": 0.0001, "loss": 1.3432, "loss/crossentropy": 2.7686296701431274, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.19475597143173218, "step": 9125 }, { "epoch": 0.13627098903232068, "grad_norm": 0.3671875, "grad_norm_var": 0.0012932459513346354, "learning_rate": 0.0001, "loss": 1.5061, "loss/crossentropy": 2.5847355127334595, "loss/fcd": 1.27734375, "loss/idx": 10.5, "loss/logits": 0.22873596847057343, "step": 9126 }, { "epoch": 0.13628592120293567, "grad_norm": 0.404296875, "grad_norm_var": 0.0027009288469950357, "learning_rate": 0.0001, "loss": 1.5674, "loss/crossentropy": 2.510115385055542, "loss/fcd": 1.33984375, "loss/idx": 10.5, "loss/logits": 0.22759009152650833, "step": 9127 }, { "epoch": 0.13630085337355063, "grad_norm": 0.267578125, "grad_norm_var": 0.0026154160499572755, "learning_rate": 0.0001, "loss": 1.3749, "loss/crossentropy": 2.4079689979553223, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.17180316150188446, "step": 9128 }, { "epoch": 0.13631578554416562, "grad_norm": 0.2451171875, "grad_norm_var": 0.0026180267333984373, "learning_rate": 0.0001, "loss": 1.2771, "loss/crossentropy": 2.840963125228882, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.17555566132068634, "step": 9129 }, { "epoch": 0.1363307177147806, "grad_norm": 0.2236328125, "grad_norm_var": 0.002661764621734619, "learning_rate": 0.0001, "loss": 1.2997, "loss/crossentropy": 2.8121856451034546, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.19029565155506134, "step": 9130 }, { "epoch": 0.1363456498853956, "grad_norm": 0.25390625, "grad_norm_var": 0.002502298355102539, "learning_rate": 0.0001, "loss": 1.3951, "loss/crossentropy": 2.7678929567337036, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.2075600028038025, "step": 9131 }, { "epoch": 0.13636058205601057, "grad_norm": 0.2431640625, "grad_norm_var": 0.002502298355102539, "learning_rate": 0.0001, "loss": 1.3163, "loss/crossentropy": 2.64204204082489, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.1796204075217247, "step": 9132 }, { "epoch": 0.13637551422662555, "grad_norm": 0.2490234375, "grad_norm_var": 0.002480598290761312, "learning_rate": 0.0001, "loss": 1.4096, "loss/crossentropy": 2.7175354957580566, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.20643140375614166, "step": 9133 }, { "epoch": 0.13639044639724054, "grad_norm": 0.2392578125, "grad_norm_var": 0.0025153477986653646, "learning_rate": 0.0001, "loss": 1.3492, "loss/crossentropy": 2.630609393119812, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.18517885357141495, "step": 9134 }, { "epoch": 0.1364053785678555, "grad_norm": 0.2412109375, "grad_norm_var": 0.0024347901344299316, "learning_rate": 0.0001, "loss": 1.3232, "loss/crossentropy": 2.62805712223053, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.17478622496128082, "step": 9135 }, { "epoch": 0.1364203107384705, "grad_norm": 0.21875, "grad_norm_var": 0.002564394474029541, "learning_rate": 0.0001, "loss": 1.2313, "loss/crossentropy": 2.770879030227661, "loss/fcd": 1.0625, "loss/idx": 10.5, "loss/logits": 0.1687762290239334, "step": 9136 }, { "epoch": 0.13643524290908549, "grad_norm": 0.375, "grad_norm_var": 0.0033435463905334474, "learning_rate": 0.0001, "loss": 1.5925, "loss/crossentropy": 2.636583089828491, "loss/fcd": 1.35546875, "loss/idx": 10.5, "loss/logits": 0.23699109256267548, "step": 9137 }, { "epoch": 0.13645017507970045, "grad_norm": 0.228515625, "grad_norm_var": 0.00344314177831014, "learning_rate": 0.0001, "loss": 1.3393, "loss/crossentropy": 2.473531723022461, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.18690931797027588, "step": 9138 }, { "epoch": 0.13646510725031544, "grad_norm": 0.236328125, "grad_norm_var": 0.003461440404256185, "learning_rate": 0.0001, "loss": 1.3643, "loss/crossentropy": 2.4763684272766113, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.20414890348911285, "step": 9139 }, { "epoch": 0.13648003942093043, "grad_norm": 0.255859375, "grad_norm_var": 0.003410176436106364, "learning_rate": 0.0001, "loss": 1.4516, "loss/crossentropy": 2.350777506828308, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.21725191920995712, "step": 9140 }, { "epoch": 0.13649497159154542, "grad_norm": 0.220703125, "grad_norm_var": 0.0034950892130533855, "learning_rate": 0.0001, "loss": 1.4094, "loss/crossentropy": 2.5191015005111694, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.2218540459871292, "step": 9141 }, { "epoch": 0.13650990376216038, "grad_norm": 0.248046875, "grad_norm_var": 0.0027882734934488933, "learning_rate": 0.0001, "loss": 1.3668, "loss/crossentropy": 2.8680405616760254, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.21059739589691162, "step": 9142 }, { "epoch": 0.13652483593277537, "grad_norm": 0.283203125, "grad_norm_var": 0.0013652642567952473, "learning_rate": 0.0001, "loss": 1.4279, "loss/crossentropy": 2.680933356285095, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.2013252079486847, "step": 9143 }, { "epoch": 0.13653976810339036, "grad_norm": 0.23828125, "grad_norm_var": 0.0013573964436848958, "learning_rate": 0.0001, "loss": 1.3845, "loss/crossentropy": 2.707078695297241, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20868144929409027, "step": 9144 }, { "epoch": 0.13655470027400532, "grad_norm": 0.2138671875, "grad_norm_var": 0.0014387766520182292, "learning_rate": 0.0001, "loss": 1.2925, "loss/crossentropy": 2.379731774330139, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18312180787324905, "step": 9145 }, { "epoch": 0.1365696324446203, "grad_norm": 0.2353515625, "grad_norm_var": 0.0014092127482096355, "learning_rate": 0.0001, "loss": 1.3825, "loss/crossentropy": 2.723687529563904, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.1988738626241684, "step": 9146 }, { "epoch": 0.1365845646152353, "grad_norm": 0.263671875, "grad_norm_var": 0.0014218489329020183, "learning_rate": 0.0001, "loss": 1.3019, "loss/crossentropy": 2.7709734439849854, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.18081742525100708, "step": 9147 }, { "epoch": 0.1365994967858503, "grad_norm": 0.23046875, "grad_norm_var": 0.0014424602190653483, "learning_rate": 0.0001, "loss": 1.443, "loss/crossentropy": 2.5858166217803955, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.228162944316864, "step": 9148 }, { "epoch": 0.13661442895646525, "grad_norm": 0.31640625, "grad_norm_var": 0.0017300764719645181, "learning_rate": 0.0001, "loss": 1.6928, "loss/crossentropy": 2.7829216718673706, "loss/fcd": 1.42578125, "loss/idx": 10.5, "loss/logits": 0.26697342097759247, "step": 9149 }, { "epoch": 0.13662936112708024, "grad_norm": 0.20703125, "grad_norm_var": 0.0018532077471415203, "learning_rate": 0.0001, "loss": 1.2362, "loss/crossentropy": 2.670426607131958, "loss/fcd": 1.06640625, "loss/idx": 10.5, "loss/logits": 0.16982188820838928, "step": 9150 }, { "epoch": 0.13664429329769523, "grad_norm": 0.2333984375, "grad_norm_var": 0.0018670042355855307, "learning_rate": 0.0001, "loss": 1.4087, "loss/crossentropy": 2.4881962537765503, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.2211511731147766, "step": 9151 }, { "epoch": 0.1366592254683102, "grad_norm": 0.259765625, "grad_norm_var": 0.0017995794614156088, "learning_rate": 0.0001, "loss": 1.4978, "loss/crossentropy": 2.575109124183655, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.232187919318676, "step": 9152 }, { "epoch": 0.13667415763892518, "grad_norm": 0.236328125, "grad_norm_var": 0.0007432897885640463, "learning_rate": 0.0001, "loss": 1.1991, "loss/crossentropy": 2.5374629497528076, "loss/fcd": 1.046875, "loss/idx": 10.5, "loss/logits": 0.15227492153644562, "step": 9153 }, { "epoch": 0.13668908980954017, "grad_norm": 0.22265625, "grad_norm_var": 0.0007576902707417805, "learning_rate": 0.0001, "loss": 1.245, "loss/crossentropy": 2.6171385049819946, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.16692180186510086, "step": 9154 }, { "epoch": 0.13670402198015513, "grad_norm": 0.2294921875, "grad_norm_var": 0.0007674535115559896, "learning_rate": 0.0001, "loss": 1.3687, "loss/crossentropy": 2.5488518476486206, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.2085919752717018, "step": 9155 }, { "epoch": 0.13671895415077012, "grad_norm": 0.234375, "grad_norm_var": 0.0007606347401936849, "learning_rate": 0.0001, "loss": 1.4154, "loss/crossentropy": 2.656474232673645, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.21616242825984955, "step": 9156 }, { "epoch": 0.1367338863213851, "grad_norm": 0.220703125, "grad_norm_var": 0.0007606347401936849, "learning_rate": 0.0001, "loss": 1.3441, "loss/crossentropy": 2.601918935775757, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.1995932161808014, "step": 9157 }, { "epoch": 0.1367488184920001, "grad_norm": 0.248046875, "grad_norm_var": 0.0007606347401936849, "learning_rate": 0.0001, "loss": 1.4193, "loss/crossentropy": 2.396089792251587, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.19667299836874008, "step": 9158 }, { "epoch": 0.13676375066261506, "grad_norm": 0.2421875, "grad_norm_var": 0.0006408055623372395, "learning_rate": 0.0001, "loss": 1.2948, "loss/crossentropy": 2.524492859840393, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18544571101665497, "step": 9159 }, { "epoch": 0.13677868283323005, "grad_norm": 0.265625, "grad_norm_var": 0.0006830851236979167, "learning_rate": 0.0001, "loss": 1.2938, "loss/crossentropy": 2.9174578189849854, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.17267020046710968, "step": 9160 }, { "epoch": 0.13679361500384504, "grad_norm": 0.263671875, "grad_norm_var": 0.0006565372149149577, "learning_rate": 0.0001, "loss": 1.2959, "loss/crossentropy": 2.7489233016967773, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.18266214430332184, "step": 9161 }, { "epoch": 0.13680854717446, "grad_norm": 0.255859375, "grad_norm_var": 0.0006582895914713542, "learning_rate": 0.0001, "loss": 1.3295, "loss/crossentropy": 2.483306407928467, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.1888880729675293, "step": 9162 }, { "epoch": 0.136823479345075, "grad_norm": 0.25390625, "grad_norm_var": 0.0006407260894775391, "learning_rate": 0.0001, "loss": 1.4999, "loss/crossentropy": 2.673160195350647, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.23818232864141464, "step": 9163 }, { "epoch": 0.13683841151568998, "grad_norm": 0.236328125, "grad_norm_var": 0.0006315231323242188, "learning_rate": 0.0001, "loss": 1.3332, "loss/crossentropy": 2.848286509513855, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.1965118795633316, "step": 9164 }, { "epoch": 0.13685334368630497, "grad_norm": 0.2236328125, "grad_norm_var": 0.00029064416885375975, "learning_rate": 0.0001, "loss": 1.2301, "loss/crossentropy": 2.5271923542022705, "loss/fcd": 1.06640625, "loss/idx": 10.5, "loss/logits": 0.1636795923113823, "step": 9165 }, { "epoch": 0.13686827585691994, "grad_norm": 0.259765625, "grad_norm_var": 0.00023571252822875976, "learning_rate": 0.0001, "loss": 1.3849, "loss/crossentropy": 2.592473268508911, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.19739097356796265, "step": 9166 }, { "epoch": 0.13688320802753492, "grad_norm": 0.236328125, "grad_norm_var": 0.00023255348205566406, "learning_rate": 0.0001, "loss": 1.2843, "loss/crossentropy": 2.6029754877090454, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.17106252163648605, "step": 9167 }, { "epoch": 0.13689814019814991, "grad_norm": 0.271484375, "grad_norm_var": 0.00026726722717285156, "learning_rate": 0.0001, "loss": 1.4874, "loss/crossentropy": 2.447255849838257, "loss/fcd": 1.26953125, "loss/idx": 10.5, "loss/logits": 0.2178295999765396, "step": 9168 }, { "epoch": 0.13691307236876488, "grad_norm": 0.220703125, "grad_norm_var": 0.0002980391184488932, "learning_rate": 0.0001, "loss": 1.4175, "loss/crossentropy": 2.3970640897750854, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.21438640356063843, "step": 9169 }, { "epoch": 0.13692800453937987, "grad_norm": 0.2294921875, "grad_norm_var": 0.00028260151545206704, "learning_rate": 0.0001, "loss": 1.2943, "loss/crossentropy": 2.533852219581604, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.16927286982536316, "step": 9170 }, { "epoch": 0.13694293670999486, "grad_norm": 0.2421875, "grad_norm_var": 0.0002694288889567057, "learning_rate": 0.0001, "loss": 1.4289, "loss/crossentropy": 2.3491060733795166, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.2101437970995903, "step": 9171 }, { "epoch": 0.13695786888060982, "grad_norm": 0.2314453125, "grad_norm_var": 0.0002737323443094889, "learning_rate": 0.0001, "loss": 1.3696, "loss/crossentropy": 2.6185877323150635, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.2016267329454422, "step": 9172 }, { "epoch": 0.1369728010512248, "grad_norm": 0.2734375, "grad_norm_var": 0.0002848903338114421, "learning_rate": 0.0001, "loss": 1.3549, "loss/crossentropy": 2.4829803705215454, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.17908575385808945, "step": 9173 }, { "epoch": 0.1369877332218398, "grad_norm": 0.296875, "grad_norm_var": 0.0004398624102274577, "learning_rate": 0.0001, "loss": 1.5368, "loss/crossentropy": 2.684633493423462, "loss/fcd": 1.30078125, "loss/idx": 10.5, "loss/logits": 0.23600134253501892, "step": 9174 }, { "epoch": 0.1370026653924548, "grad_norm": 0.26953125, "grad_norm_var": 0.0004574418067932129, "learning_rate": 0.0001, "loss": 1.3784, "loss/crossentropy": 2.763710618019104, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.20656999945640564, "step": 9175 }, { "epoch": 0.13701759756306975, "grad_norm": 0.251953125, "grad_norm_var": 0.0004440903663635254, "learning_rate": 0.0001, "loss": 1.2633, "loss/crossentropy": 2.4182533025741577, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.16171307861804962, "step": 9176 }, { "epoch": 0.13703252973368474, "grad_norm": 0.2490234375, "grad_norm_var": 0.0004328250885009766, "learning_rate": 0.0001, "loss": 1.328, "loss/crossentropy": 2.592628598213196, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.1912890449166298, "step": 9177 }, { "epoch": 0.13704746190429973, "grad_norm": 0.279296875, "grad_norm_var": 0.00048508644104003904, "learning_rate": 0.0001, "loss": 1.3756, "loss/crossentropy": 2.415190815925598, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.1958690732717514, "step": 9178 }, { "epoch": 0.1370623940749147, "grad_norm": 0.2216796875, "grad_norm_var": 0.0005400300025939942, "learning_rate": 0.0001, "loss": 1.3385, "loss/crossentropy": 2.496687173843384, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.1861182078719139, "step": 9179 }, { "epoch": 0.13707732624552968, "grad_norm": 0.263671875, "grad_norm_var": 0.000538472334543864, "learning_rate": 0.0001, "loss": 1.4886, "loss/crossentropy": 2.6902090311050415, "loss/fcd": 1.27734375, "loss/idx": 10.5, "loss/logits": 0.21121416985988617, "step": 9180 }, { "epoch": 0.13709225841614467, "grad_norm": 0.2255859375, "grad_norm_var": 0.0005315105120340983, "learning_rate": 0.0001, "loss": 1.3231, "loss/crossentropy": 2.5191508531570435, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.17856717854738235, "step": 9181 }, { "epoch": 0.13710719058675963, "grad_norm": 0.2265625, "grad_norm_var": 0.0005633950233459473, "learning_rate": 0.0001, "loss": 1.2568, "loss/crossentropy": 2.3524091243743896, "loss/fcd": 1.083984375, "loss/idx": 10.5, "loss/logits": 0.17285270243883133, "step": 9182 }, { "epoch": 0.13712212275737462, "grad_norm": 0.25, "grad_norm_var": 0.0005513787269592285, "learning_rate": 0.0001, "loss": 1.3027, "loss/crossentropy": 2.629111886024475, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.19334828853607178, "step": 9183 }, { "epoch": 0.1371370549279896, "grad_norm": 0.322265625, "grad_norm_var": 0.00085677703221639, "learning_rate": 0.0001, "loss": 1.5627, "loss/crossentropy": 2.5339165925979614, "loss/fcd": 1.27734375, "loss/idx": 10.5, "loss/logits": 0.2853336036205292, "step": 9184 }, { "epoch": 0.1371519870986046, "grad_norm": 0.2041015625, "grad_norm_var": 0.0009462833404541016, "learning_rate": 0.0001, "loss": 1.2175, "loss/crossentropy": 2.6889145374298096, "loss/fcd": 1.048828125, "loss/idx": 10.5, "loss/logits": 0.16863828897476196, "step": 9185 }, { "epoch": 0.13716691926921956, "grad_norm": 0.2236328125, "grad_norm_var": 0.0009662628173828125, "learning_rate": 0.0001, "loss": 1.4407, "loss/crossentropy": 2.5601370334625244, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.22199049592018127, "step": 9186 }, { "epoch": 0.13718185143983455, "grad_norm": 0.216796875, "grad_norm_var": 0.0010396162668863931, "learning_rate": 0.0001, "loss": 1.226, "loss/crossentropy": 2.4627625942230225, "loss/fcd": 1.072265625, "loss/idx": 10.5, "loss/logits": 0.1537710204720497, "step": 9187 }, { "epoch": 0.13719678361044954, "grad_norm": 0.21875, "grad_norm_var": 0.0010817170143127442, "learning_rate": 0.0001, "loss": 1.2926, "loss/crossentropy": 2.404692530632019, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.17934474349021912, "step": 9188 }, { "epoch": 0.1372117157810645, "grad_norm": 0.33203125, "grad_norm_var": 0.0014827370643615723, "learning_rate": 0.0001, "loss": 1.6062, "loss/crossentropy": 2.51652193069458, "loss/fcd": 1.34765625, "loss/idx": 10.5, "loss/logits": 0.2585304379463196, "step": 9189 }, { "epoch": 0.1372266479516795, "grad_norm": 0.263671875, "grad_norm_var": 0.0013584415117899576, "learning_rate": 0.0001, "loss": 1.3655, "loss/crossentropy": 2.9135522842407227, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.1897471398115158, "step": 9190 }, { "epoch": 0.13724158012229448, "grad_norm": 0.248046875, "grad_norm_var": 0.001334663232167562, "learning_rate": 0.0001, "loss": 1.3594, "loss/crossentropy": 2.398937225341797, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.18748918920755386, "step": 9191 }, { "epoch": 0.13725651229290947, "grad_norm": 0.23046875, "grad_norm_var": 0.0013573924700419108, "learning_rate": 0.0001, "loss": 1.3463, "loss/crossentropy": 2.6184931993484497, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.19395644217729568, "step": 9192 }, { "epoch": 0.13727144446352443, "grad_norm": 0.2353515625, "grad_norm_var": 0.0013680736223856608, "learning_rate": 0.0001, "loss": 1.365, "loss/crossentropy": 2.887603998184204, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.19698261469602585, "step": 9193 }, { "epoch": 0.13728637663413942, "grad_norm": 0.23828125, "grad_norm_var": 0.0012999812761942545, "learning_rate": 0.0001, "loss": 1.4226, "loss/crossentropy": 2.4972416162490845, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.21949759870767593, "step": 9194 }, { "epoch": 0.1373013088047544, "grad_norm": 0.2470703125, "grad_norm_var": 0.00126113494237264, "learning_rate": 0.0001, "loss": 1.342, "loss/crossentropy": 2.747400164604187, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.19751641899347305, "step": 9195 }, { "epoch": 0.13731624097536937, "grad_norm": 0.25, "grad_norm_var": 0.001241775353749593, "learning_rate": 0.0001, "loss": 1.3472, "loss/crossentropy": 2.5609813928604126, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.19097120314836502, "step": 9196 }, { "epoch": 0.13733117314598436, "grad_norm": 0.21484375, "grad_norm_var": 0.001277923583984375, "learning_rate": 0.0001, "loss": 1.3717, "loss/crossentropy": 2.6085487604141235, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.1998017504811287, "step": 9197 }, { "epoch": 0.13734610531659935, "grad_norm": 0.23828125, "grad_norm_var": 0.0012575149536132812, "learning_rate": 0.0001, "loss": 1.3935, "loss/crossentropy": 2.7649872303009033, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.21378445625305176, "step": 9198 }, { "epoch": 0.13736103748721432, "grad_norm": 0.3359375, "grad_norm_var": 0.001766649881998698, "learning_rate": 0.0001, "loss": 1.7812, "loss/crossentropy": 2.2678275108337402, "loss/fcd": 1.4921875, "loss/idx": 10.5, "loss/logits": 0.2890237122774124, "step": 9199 }, { "epoch": 0.1373759696578293, "grad_norm": 0.205078125, "grad_norm_var": 0.001514879862467448, "learning_rate": 0.0001, "loss": 1.3903, "loss/crossentropy": 2.395118236541748, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.2184540033340454, "step": 9200 }, { "epoch": 0.1373909018284443, "grad_norm": 0.357421875, "grad_norm_var": 0.00217055877049764, "learning_rate": 0.0001, "loss": 1.4999, "loss/crossentropy": 2.4111353158950806, "loss/fcd": 1.30859375, "loss/idx": 10.5, "loss/logits": 0.19127864390611649, "step": 9201 }, { "epoch": 0.13740583399905928, "grad_norm": 0.265625, "grad_norm_var": 0.0021136601765950522, "learning_rate": 0.0001, "loss": 1.4291, "loss/crossentropy": 2.3940497636795044, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.20646100491285324, "step": 9202 }, { "epoch": 0.13742076616967425, "grad_norm": 0.2333984375, "grad_norm_var": 0.0020438790321350097, "learning_rate": 0.0001, "loss": 1.4616, "loss/crossentropy": 2.273738980293274, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.21550165116786957, "step": 9203 }, { "epoch": 0.13743569834028924, "grad_norm": 0.21484375, "grad_norm_var": 0.002064828077952067, "learning_rate": 0.0001, "loss": 1.3151, "loss/crossentropy": 2.5735210180282593, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.19010405242443085, "step": 9204 }, { "epoch": 0.13745063051090423, "grad_norm": 0.2578125, "grad_norm_var": 0.0016655882199605307, "learning_rate": 0.0001, "loss": 1.5077, "loss/crossentropy": 2.6180495023727417, "loss/fcd": 1.26953125, "loss/idx": 10.5, "loss/logits": 0.2381942719221115, "step": 9205 }, { "epoch": 0.1374655626815192, "grad_norm": 0.2421875, "grad_norm_var": 0.0016617417335510255, "learning_rate": 0.0001, "loss": 1.3484, "loss/crossentropy": 2.497321844100952, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.1921149641275406, "step": 9206 }, { "epoch": 0.13748049485213418, "grad_norm": 0.2734375, "grad_norm_var": 0.0016923228899637858, "learning_rate": 0.0001, "loss": 1.4195, "loss/crossentropy": 2.8270580768585205, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.20853833109140396, "step": 9207 }, { "epoch": 0.13749542702274917, "grad_norm": 0.26171875, "grad_norm_var": 0.0016615509986877442, "learning_rate": 0.0001, "loss": 1.454, "loss/crossentropy": 2.6208678483963013, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.21180111914873123, "step": 9208 }, { "epoch": 0.13751035919336416, "grad_norm": 0.30859375, "grad_norm_var": 0.0018102645874023438, "learning_rate": 0.0001, "loss": 1.5086, "loss/crossentropy": 2.576505422592163, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.2429300621151924, "step": 9209 }, { "epoch": 0.13752529136397912, "grad_norm": 0.248046875, "grad_norm_var": 0.0017892042795817056, "learning_rate": 0.0001, "loss": 1.4185, "loss/crossentropy": 2.677477717399597, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.23097120225429535, "step": 9210 }, { "epoch": 0.1375402235345941, "grad_norm": 0.2255859375, "grad_norm_var": 0.001854070027669271, "learning_rate": 0.0001, "loss": 1.4451, "loss/crossentropy": 2.6378930807113647, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.2185460850596428, "step": 9211 }, { "epoch": 0.1375551557052091, "grad_norm": 0.26171875, "grad_norm_var": 0.001849683125813802, "learning_rate": 0.0001, "loss": 1.3978, "loss/crossentropy": 2.8314647674560547, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.22198127210140228, "step": 9212 }, { "epoch": 0.13757008787582406, "grad_norm": 0.2255859375, "grad_norm_var": 0.0017936031023661296, "learning_rate": 0.0001, "loss": 1.3217, "loss/crossentropy": 2.441297769546509, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.19673200696706772, "step": 9213 }, { "epoch": 0.13758502004643905, "grad_norm": 0.23828125, "grad_norm_var": 0.0017936031023661296, "learning_rate": 0.0001, "loss": 1.3313, "loss/crossentropy": 2.6326977014541626, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.1907035857439041, "step": 9214 }, { "epoch": 0.13759995221705404, "grad_norm": 0.208984375, "grad_norm_var": 0.0015105207761128743, "learning_rate": 0.0001, "loss": 1.3395, "loss/crossentropy": 2.690343499183655, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.19492223858833313, "step": 9215 }, { "epoch": 0.137614884387669, "grad_norm": 0.26953125, "grad_norm_var": 0.0013689001401265462, "learning_rate": 0.0001, "loss": 1.5833, "loss/crossentropy": 2.528008222579956, "loss/fcd": 1.34375, "loss/idx": 10.5, "loss/logits": 0.23954188078641891, "step": 9216 }, { "epoch": 0.137629816558284, "grad_norm": 0.275390625, "grad_norm_var": 0.0006779630978902181, "learning_rate": 0.0001, "loss": 1.4119, "loss/crossentropy": 2.570092558860779, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.22829680144786835, "step": 9217 }, { "epoch": 0.13764474872889898, "grad_norm": 0.2578125, "grad_norm_var": 0.000666201114654541, "learning_rate": 0.0001, "loss": 1.5149, "loss/crossentropy": 2.3238476514816284, "loss/fcd": 1.2890625, "loss/idx": 10.5, "loss/logits": 0.22587919235229492, "step": 9218 }, { "epoch": 0.13765968089951397, "grad_norm": 0.2373046875, "grad_norm_var": 0.00065841277440389, "learning_rate": 0.0001, "loss": 1.3286, "loss/crossentropy": 2.6220271587371826, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.18800482153892517, "step": 9219 }, { "epoch": 0.13767461307012893, "grad_norm": 0.255859375, "grad_norm_var": 0.0005689581235249837, "learning_rate": 0.0001, "loss": 1.4033, "loss/crossentropy": 2.7450355291366577, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.21580226719379425, "step": 9220 }, { "epoch": 0.13768954524074392, "grad_norm": 0.2138671875, "grad_norm_var": 0.0006614049275716146, "learning_rate": 0.0001, "loss": 1.2587, "loss/crossentropy": 2.6521400213241577, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.164969764649868, "step": 9221 }, { "epoch": 0.1377044774113589, "grad_norm": 0.2265625, "grad_norm_var": 0.0006934483846028646, "learning_rate": 0.0001, "loss": 1.4231, "loss/crossentropy": 2.375299096107483, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.2082795649766922, "step": 9222 }, { "epoch": 0.13771940958197387, "grad_norm": 0.265625, "grad_norm_var": 0.0006720860799153646, "learning_rate": 0.0001, "loss": 1.3329, "loss/crossentropy": 2.533980131149292, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.18449628353118896, "step": 9223 }, { "epoch": 0.13773434175258886, "grad_norm": 0.2451171875, "grad_norm_var": 0.0006606698036193848, "learning_rate": 0.0001, "loss": 1.3648, "loss/crossentropy": 2.687705636024475, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.18898358941078186, "step": 9224 }, { "epoch": 0.13774927392320385, "grad_norm": 0.31640625, "grad_norm_var": 0.0007278720537821451, "learning_rate": 0.0001, "loss": 1.5529, "loss/crossentropy": 2.504470467567444, "loss/fcd": 1.3359375, "loss/idx": 10.5, "loss/logits": 0.2169480323791504, "step": 9225 }, { "epoch": 0.13776420609381884, "grad_norm": 0.20703125, "grad_norm_var": 0.0008340160051981608, "learning_rate": 0.0001, "loss": 1.3446, "loss/crossentropy": 2.58572256565094, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.2000223472714424, "step": 9226 }, { "epoch": 0.1377791382644338, "grad_norm": 0.28125, "grad_norm_var": 0.0008786360422770183, "learning_rate": 0.0001, "loss": 1.2952, "loss/crossentropy": 2.5717045068740845, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.177979975938797, "step": 9227 }, { "epoch": 0.1377940704350488, "grad_norm": 0.30078125, "grad_norm_var": 0.0010394891103108724, "learning_rate": 0.0001, "loss": 1.4006, "loss/crossentropy": 2.656201124191284, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.21704521030187607, "step": 9228 }, { "epoch": 0.13780900260566378, "grad_norm": 0.2392578125, "grad_norm_var": 0.0010037740071614583, "learning_rate": 0.0001, "loss": 1.3051, "loss/crossentropy": 2.8412073850631714, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.19570914655923843, "step": 9229 }, { "epoch": 0.13782393477627874, "grad_norm": 0.283203125, "grad_norm_var": 0.001045083999633789, "learning_rate": 0.0001, "loss": 1.407, "loss/crossentropy": 2.6275291442871094, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.2077876329421997, "step": 9230 }, { "epoch": 0.13783886694689373, "grad_norm": 0.2353515625, "grad_norm_var": 0.0009258866310119628, "learning_rate": 0.0001, "loss": 1.3264, "loss/crossentropy": 2.610389232635498, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.18186041712760925, "step": 9231 }, { "epoch": 0.13785379911750872, "grad_norm": 0.2060546875, "grad_norm_var": 0.0010707855224609375, "learning_rate": 0.0001, "loss": 1.1775, "loss/crossentropy": 2.614522099494934, "loss/fcd": 1.021484375, "loss/idx": 10.5, "loss/logits": 0.1560133993625641, "step": 9232 }, { "epoch": 0.13786873128812369, "grad_norm": 0.2197265625, "grad_norm_var": 0.0010977387428283691, "learning_rate": 0.0001, "loss": 1.3402, "loss/crossentropy": 2.5671051740646362, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.191771037876606, "step": 9233 }, { "epoch": 0.13788366345873868, "grad_norm": 0.197265625, "grad_norm_var": 0.0012593547503153483, "learning_rate": 0.0001, "loss": 1.2653, "loss/crossentropy": 2.650517702102661, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.17932742834091187, "step": 9234 }, { "epoch": 0.13789859562935367, "grad_norm": 0.3671875, "grad_norm_var": 0.002168893814086914, "learning_rate": 0.0001, "loss": 1.7975, "loss/crossentropy": 2.208650827407837, "loss/fcd": 1.4921875, "loss/idx": 10.5, "loss/logits": 0.3053550720214844, "step": 9235 }, { "epoch": 0.13791352779996865, "grad_norm": 0.2412109375, "grad_norm_var": 0.0021782517433166504, "learning_rate": 0.0001, "loss": 1.4176, "loss/crossentropy": 2.5540599822998047, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.21836279332637787, "step": 9236 }, { "epoch": 0.13792845997058362, "grad_norm": 0.287109375, "grad_norm_var": 0.0021326541900634766, "learning_rate": 0.0001, "loss": 1.5063, "loss/crossentropy": 2.4214558601379395, "loss/fcd": 1.30859375, "loss/idx": 10.5, "loss/logits": 0.19774606823921204, "step": 9237 }, { "epoch": 0.1379433921411986, "grad_norm": 0.2158203125, "grad_norm_var": 0.0021841009457906086, "learning_rate": 0.0001, "loss": 1.2865, "loss/crossentropy": 2.538418412208557, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.18887589871883392, "step": 9238 }, { "epoch": 0.1379583243118136, "grad_norm": 0.21875, "grad_norm_var": 0.002266116937001546, "learning_rate": 0.0001, "loss": 1.2386, "loss/crossentropy": 2.6595853567123413, "loss/fcd": 1.06640625, "loss/idx": 10.5, "loss/logits": 0.17219798266887665, "step": 9239 }, { "epoch": 0.13797325648242856, "grad_norm": 0.259765625, "grad_norm_var": 0.0022624810536702473, "learning_rate": 0.0001, "loss": 1.4607, "loss/crossentropy": 2.509492516517639, "loss/fcd": 1.25390625, "loss/idx": 10.5, "loss/logits": 0.20676523447036743, "step": 9240 }, { "epoch": 0.13798818865304355, "grad_norm": 0.203125, "grad_norm_var": 0.0021334171295166017, "learning_rate": 0.0001, "loss": 1.241, "loss/crossentropy": 2.4579875469207764, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.16679684817790985, "step": 9241 }, { "epoch": 0.13800312082365854, "grad_norm": 0.2431640625, "grad_norm_var": 0.0020191788673400877, "learning_rate": 0.0001, "loss": 1.3937, "loss/crossentropy": 2.87295925617218, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.21015437692403793, "step": 9242 }, { "epoch": 0.1380180529942735, "grad_norm": 0.2314453125, "grad_norm_var": 0.0019662857055664064, "learning_rate": 0.0001, "loss": 1.3771, "loss/crossentropy": 2.5365735292434692, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.2052556276321411, "step": 9243 }, { "epoch": 0.1380329851648885, "grad_norm": 0.224609375, "grad_norm_var": 0.0017809391021728516, "learning_rate": 0.0001, "loss": 1.4069, "loss/crossentropy": 2.683152914047241, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.2037625014781952, "step": 9244 }, { "epoch": 0.13804791733550348, "grad_norm": 0.19921875, "grad_norm_var": 0.0018961230913798015, "learning_rate": 0.0001, "loss": 1.1998, "loss/crossentropy": 2.413627505302429, "loss/fcd": 1.044921875, "loss/idx": 10.5, "loss/logits": 0.15491291135549545, "step": 9245 }, { "epoch": 0.13806284950611847, "grad_norm": 0.216796875, "grad_norm_var": 0.0017853379249572754, "learning_rate": 0.0001, "loss": 1.2617, "loss/crossentropy": 2.5470954179763794, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.16408520936965942, "step": 9246 }, { "epoch": 0.13807778167673343, "grad_norm": 0.216796875, "grad_norm_var": 0.0018070062001546224, "learning_rate": 0.0001, "loss": 1.2532, "loss/crossentropy": 2.632246732711792, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.17120381444692612, "step": 9247 }, { "epoch": 0.13809271384734842, "grad_norm": 0.23046875, "grad_norm_var": 0.00175246795018514, "learning_rate": 0.0001, "loss": 1.3184, "loss/crossentropy": 2.4757641553878784, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.18165162950754166, "step": 9248 }, { "epoch": 0.1381076460179634, "grad_norm": 0.208984375, "grad_norm_var": 0.001782671610514323, "learning_rate": 0.0001, "loss": 1.2669, "loss/crossentropy": 2.6590739488601685, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.17709524184465408, "step": 9249 }, { "epoch": 0.13812257818857837, "grad_norm": 0.279296875, "grad_norm_var": 0.0017893473307291667, "learning_rate": 0.0001, "loss": 1.7234, "loss/crossentropy": 2.709471821784973, "loss/fcd": 1.44921875, "loss/idx": 10.5, "loss/logits": 0.2741483002901077, "step": 9250 }, { "epoch": 0.13813751035919336, "grad_norm": 0.2138671875, "grad_norm_var": 0.0006632765134175618, "learning_rate": 0.0001, "loss": 1.3085, "loss/crossentropy": 2.602384924888611, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.18354595452547073, "step": 9251 }, { "epoch": 0.13815244252980835, "grad_norm": 0.44921875, "grad_norm_var": 0.0036603291829427082, "learning_rate": 0.0001, "loss": 1.3799, "loss/crossentropy": 2.515756130218506, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.21194075793027878, "step": 9252 }, { "epoch": 0.13816737470042334, "grad_norm": 0.22265625, "grad_norm_var": 0.0035465081532796224, "learning_rate": 0.0001, "loss": 1.3595, "loss/crossentropy": 2.833655595779419, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.19547592103481293, "step": 9253 }, { "epoch": 0.1381823068710383, "grad_norm": 0.216796875, "grad_norm_var": 0.0035434683163960774, "learning_rate": 0.0001, "loss": 1.3723, "loss/crossentropy": 2.5721940994262695, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.2043195590376854, "step": 9254 }, { "epoch": 0.1381972390416533, "grad_norm": 0.287109375, "grad_norm_var": 0.003644716739654541, "learning_rate": 0.0001, "loss": 1.6126, "loss/crossentropy": 2.467045545578003, "loss/fcd": 1.35546875, "loss/idx": 10.5, "loss/logits": 0.25712864100933075, "step": 9255 }, { "epoch": 0.13821217121226828, "grad_norm": 0.2119140625, "grad_norm_var": 0.003686968485514323, "learning_rate": 0.0001, "loss": 1.2978, "loss/crossentropy": 2.482333183288574, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.18062114715576172, "step": 9256 }, { "epoch": 0.13822710338288324, "grad_norm": 0.2158203125, "grad_norm_var": 0.0036329865455627442, "learning_rate": 0.0001, "loss": 1.2963, "loss/crossentropy": 2.455147862434387, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1830403134226799, "step": 9257 }, { "epoch": 0.13824203555349823, "grad_norm": 0.2314453125, "grad_norm_var": 0.003639376163482666, "learning_rate": 0.0001, "loss": 1.371, "loss/crossentropy": 2.68942928314209, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.20306271314620972, "step": 9258 }, { "epoch": 0.13825696772411322, "grad_norm": 0.2216796875, "grad_norm_var": 0.003657813866933187, "learning_rate": 0.0001, "loss": 1.3322, "loss/crossentropy": 2.506842255592346, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.1876550391316414, "step": 9259 }, { "epoch": 0.13827189989472818, "grad_norm": 0.26171875, "grad_norm_var": 0.003665665785471598, "learning_rate": 0.0001, "loss": 1.427, "loss/crossentropy": 2.5461217164993286, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.200393408536911, "step": 9260 }, { "epoch": 0.13828683206534317, "grad_norm": 0.2353515625, "grad_norm_var": 0.003537607192993164, "learning_rate": 0.0001, "loss": 1.3159, "loss/crossentropy": 2.4491230249404907, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.1713893786072731, "step": 9261 }, { "epoch": 0.13830176423595816, "grad_norm": 0.275390625, "grad_norm_var": 0.0035318851470947264, "learning_rate": 0.0001, "loss": 1.3993, "loss/crossentropy": 2.4444624185562134, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.2040245309472084, "step": 9262 }, { "epoch": 0.13831669640657315, "grad_norm": 0.2099609375, "grad_norm_var": 0.003563845157623291, "learning_rate": 0.0001, "loss": 1.249, "loss/crossentropy": 2.569401264190674, "loss/fcd": 1.0703125, "loss/idx": 10.5, "loss/logits": 0.17872009426355362, "step": 9263 }, { "epoch": 0.13833162857718811, "grad_norm": 0.232421875, "grad_norm_var": 0.0035594582557678224, "learning_rate": 0.0001, "loss": 1.3756, "loss/crossentropy": 2.6036345958709717, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.1958724409341812, "step": 9264 }, { "epoch": 0.1383465607478031, "grad_norm": 0.2041015625, "grad_norm_var": 0.003586578369140625, "learning_rate": 0.0001, "loss": 1.4057, "loss/crossentropy": 2.129809319972992, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.2220645323395729, "step": 9265 }, { "epoch": 0.1383614929184181, "grad_norm": 0.2333984375, "grad_norm_var": 0.0035270015398661298, "learning_rate": 0.0001, "loss": 1.3311, "loss/crossentropy": 2.5262938737869263, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.19434572756290436, "step": 9266 }, { "epoch": 0.13837642508903306, "grad_norm": 0.39453125, "grad_norm_var": 0.0048127333323160805, "learning_rate": 0.0001, "loss": 1.4996, "loss/crossentropy": 2.311746835708618, "loss/fcd": 1.27734375, "loss/idx": 10.5, "loss/logits": 0.22229429334402084, "step": 9267 }, { "epoch": 0.13839135725964805, "grad_norm": 0.23046875, "grad_norm_var": 0.0021816094716389975, "learning_rate": 0.0001, "loss": 1.2625, "loss/crossentropy": 2.877925157546997, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.18044617027044296, "step": 9268 }, { "epoch": 0.13840628943026304, "grad_norm": 0.271484375, "grad_norm_var": 0.0021994908650716147, "learning_rate": 0.0001, "loss": 1.455, "loss/crossentropy": 2.7170190811157227, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.22064576297998428, "step": 9269 }, { "epoch": 0.13842122160087802, "grad_norm": 0.2392578125, "grad_norm_var": 0.002144014835357666, "learning_rate": 0.0001, "loss": 1.2902, "loss/crossentropy": 2.6651724576950073, "loss/fcd": 1.107421875, "loss/idx": 10.5, "loss/logits": 0.18276185542345047, "step": 9270 }, { "epoch": 0.138436153771493, "grad_norm": 0.240234375, "grad_norm_var": 0.0020322442054748536, "learning_rate": 0.0001, "loss": 1.4335, "loss/crossentropy": 2.6929935216903687, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.2225918024778366, "step": 9271 }, { "epoch": 0.13845108594210798, "grad_norm": 0.3046875, "grad_norm_var": 0.0021692752838134766, "learning_rate": 0.0001, "loss": 1.3983, "loss/crossentropy": 2.6413676738739014, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.2029976174235344, "step": 9272 }, { "epoch": 0.13846601811272297, "grad_norm": 0.240234375, "grad_norm_var": 0.002094868818918864, "learning_rate": 0.0001, "loss": 1.2273, "loss/crossentropy": 2.585928201675415, "loss/fcd": 1.0625, "loss/idx": 10.5, "loss/logits": 0.16477631032466888, "step": 9273 }, { "epoch": 0.13848095028333793, "grad_norm": 0.2353515625, "grad_norm_var": 0.002085300286610921, "learning_rate": 0.0001, "loss": 1.3939, "loss/crossentropy": 2.7082951068878174, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.2141878306865692, "step": 9274 }, { "epoch": 0.13849588245395292, "grad_norm": 0.2412109375, "grad_norm_var": 0.002030464013417562, "learning_rate": 0.0001, "loss": 1.4443, "loss/crossentropy": 2.3716986179351807, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.20991931855678558, "step": 9275 }, { "epoch": 0.1385108146245679, "grad_norm": 0.220703125, "grad_norm_var": 0.0020885427792867025, "learning_rate": 0.0001, "loss": 1.4282, "loss/crossentropy": 2.6223210096359253, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.2328624278306961, "step": 9276 }, { "epoch": 0.13852574679518287, "grad_norm": 0.234375, "grad_norm_var": 0.0020905812581380207, "learning_rate": 0.0001, "loss": 1.2846, "loss/crossentropy": 2.694027900695801, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.16743893921375275, "step": 9277 }, { "epoch": 0.13854067896579786, "grad_norm": 0.2177734375, "grad_norm_var": 0.002106757958730062, "learning_rate": 0.0001, "loss": 1.3472, "loss/crossentropy": 2.5709726810455322, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.19481110572814941, "step": 9278 }, { "epoch": 0.13855561113641285, "grad_norm": 0.2490234375, "grad_norm_var": 0.0020098010698954265, "learning_rate": 0.0001, "loss": 1.4037, "loss/crossentropy": 2.656915307044983, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.20838867872953415, "step": 9279 }, { "epoch": 0.13857054330702784, "grad_norm": 0.2158203125, "grad_norm_var": 0.0020644505818684894, "learning_rate": 0.0001, "loss": 1.3278, "loss/crossentropy": 2.330836772918701, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.1911301612854004, "step": 9280 }, { "epoch": 0.1385854754776428, "grad_norm": 0.26171875, "grad_norm_var": 0.0019324580828348795, "learning_rate": 0.0001, "loss": 1.3897, "loss/crossentropy": 2.552095890045166, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.20611850917339325, "step": 9281 }, { "epoch": 0.1386004076482578, "grad_norm": 0.2421875, "grad_norm_var": 0.0019156138102213542, "learning_rate": 0.0001, "loss": 1.2123, "loss/crossentropy": 2.8153750896453857, "loss/fcd": 1.05859375, "loss/idx": 10.5, "loss/logits": 0.15371018648147583, "step": 9282 }, { "epoch": 0.13861533981887278, "grad_norm": 0.2392578125, "grad_norm_var": 0.0004807750384012858, "learning_rate": 0.0001, "loss": 1.2924, "loss/crossentropy": 2.6011579036712646, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.18695729225873947, "step": 9283 }, { "epoch": 0.13863027198948774, "grad_norm": 0.3125, "grad_norm_var": 0.0007671634356180827, "learning_rate": 0.0001, "loss": 1.6363, "loss/crossentropy": 2.8428103923797607, "loss/fcd": 1.375, "loss/idx": 10.5, "loss/logits": 0.26130181550979614, "step": 9284 }, { "epoch": 0.13864520416010273, "grad_norm": 0.2119140625, "grad_norm_var": 0.0008013407389322917, "learning_rate": 0.0001, "loss": 1.3034, "loss/crossentropy": 2.6200250387191772, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.19008885324001312, "step": 9285 }, { "epoch": 0.13866013633071772, "grad_norm": 0.29296875, "grad_norm_var": 0.0009466767311096192, "learning_rate": 0.0001, "loss": 1.5813, "loss/crossentropy": 2.4370826482772827, "loss/fcd": 1.328125, "loss/idx": 10.5, "loss/logits": 0.25322045385837555, "step": 9286 }, { "epoch": 0.1386750685013327, "grad_norm": 0.478515625, "grad_norm_var": 0.004264541467030843, "learning_rate": 0.0001, "loss": 1.5409, "loss/crossentropy": 2.578879237174988, "loss/fcd": 1.3125, "loss/idx": 10.5, "loss/logits": 0.2284269481897354, "step": 9287 }, { "epoch": 0.13869000067194767, "grad_norm": 0.275390625, "grad_norm_var": 0.004152961572011312, "learning_rate": 0.0001, "loss": 1.2203, "loss/crossentropy": 2.7286990880966187, "loss/fcd": 1.0546875, "loss/idx": 10.5, "loss/logits": 0.1656324341893196, "step": 9288 }, { "epoch": 0.13870493284256266, "grad_norm": 0.3359375, "grad_norm_var": 0.004466052850087484, "learning_rate": 0.0001, "loss": 1.5937, "loss/crossentropy": 2.6713883876800537, "loss/fcd": 1.33203125, "loss/idx": 10.5, "loss/logits": 0.26163432002067566, "step": 9289 }, { "epoch": 0.13871986501317765, "grad_norm": 0.267578125, "grad_norm_var": 0.004396947224934896, "learning_rate": 0.0001, "loss": 1.3208, "loss/crossentropy": 2.630445957183838, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.18796399980783463, "step": 9290 }, { "epoch": 0.1387347971837926, "grad_norm": 0.22265625, "grad_norm_var": 0.004486111799875895, "learning_rate": 0.0001, "loss": 1.3621, "loss/crossentropy": 2.509289503097534, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.2176036313176155, "step": 9291 }, { "epoch": 0.1387497293544076, "grad_norm": 0.283203125, "grad_norm_var": 0.004341153303782145, "learning_rate": 0.0001, "loss": 1.4789, "loss/crossentropy": 2.396038770675659, "loss/fcd": 1.2734375, "loss/idx": 10.5, "loss/logits": 0.2054658830165863, "step": 9292 }, { "epoch": 0.1387646615250226, "grad_norm": 0.2197265625, "grad_norm_var": 0.00442668596903483, "learning_rate": 0.0001, "loss": 1.275, "loss/crossentropy": 2.7434147596359253, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.1851794421672821, "step": 9293 }, { "epoch": 0.13877959369563755, "grad_norm": 0.2578125, "grad_norm_var": 0.0042460083961486815, "learning_rate": 0.0001, "loss": 1.306, "loss/crossentropy": 2.6961392164230347, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.18102890253067017, "step": 9294 }, { "epoch": 0.13879452586625254, "grad_norm": 0.2734375, "grad_norm_var": 0.004205576578776042, "learning_rate": 0.0001, "loss": 1.4622, "loss/crossentropy": 2.5024391412734985, "loss/fcd": 1.2578125, "loss/idx": 10.5, "loss/logits": 0.2043934389948845, "step": 9295 }, { "epoch": 0.13880945803686753, "grad_norm": 0.2392578125, "grad_norm_var": 0.004056803385416667, "learning_rate": 0.0001, "loss": 1.4126, "loss/crossentropy": 2.370948553085327, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.19380419701337814, "step": 9296 }, { "epoch": 0.13882439020748252, "grad_norm": 0.21484375, "grad_norm_var": 0.004282633463541667, "learning_rate": 0.0001, "loss": 1.3193, "loss/crossentropy": 2.679005265235901, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.19430096447467804, "step": 9297 }, { "epoch": 0.13883932237809749, "grad_norm": 0.255859375, "grad_norm_var": 0.004238239924112956, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.856558918952942, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.19445601850748062, "step": 9298 }, { "epoch": 0.13885425454871247, "grad_norm": 0.275390625, "grad_norm_var": 0.004153406620025635, "learning_rate": 0.0001, "loss": 1.3068, "loss/crossentropy": 2.787700057029724, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.17786559462547302, "step": 9299 }, { "epoch": 0.13886918671932746, "grad_norm": 0.220703125, "grad_norm_var": 0.004234087467193603, "learning_rate": 0.0001, "loss": 1.3153, "loss/crossentropy": 2.595799684524536, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.16684876382350922, "step": 9300 }, { "epoch": 0.13888411888994243, "grad_norm": 0.2294921875, "grad_norm_var": 0.0041164994239807125, "learning_rate": 0.0001, "loss": 1.2677, "loss/crossentropy": 2.728354334831238, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.1778307780623436, "step": 9301 }, { "epoch": 0.13889905106055742, "grad_norm": 0.255859375, "grad_norm_var": 0.004095963637034098, "learning_rate": 0.0001, "loss": 1.3906, "loss/crossentropy": 2.8091561794281006, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.1952924057841301, "step": 9302 }, { "epoch": 0.1389139832311724, "grad_norm": 0.2734375, "grad_norm_var": 0.0009984294573465982, "learning_rate": 0.0001, "loss": 1.5493, "loss/crossentropy": 2.2864131927490234, "loss/fcd": 1.3203125, "loss/idx": 10.5, "loss/logits": 0.22896423935890198, "step": 9303 }, { "epoch": 0.13892891540178737, "grad_norm": 0.2216796875, "grad_norm_var": 0.0010419209798177083, "learning_rate": 0.0001, "loss": 1.4, "loss/crossentropy": 2.6914197206497192, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.2124500498175621, "step": 9304 }, { "epoch": 0.13894384757240236, "grad_norm": 0.216796875, "grad_norm_var": 0.0006104628245035807, "learning_rate": 0.0001, "loss": 1.2833, "loss/crossentropy": 2.68544340133667, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.17786473780870438, "step": 9305 }, { "epoch": 0.13895877974301735, "grad_norm": 0.201171875, "grad_norm_var": 0.0006904443105061849, "learning_rate": 0.0001, "loss": 1.3037, "loss/crossentropy": 2.7323802709579468, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.1864730268716812, "step": 9306 }, { "epoch": 0.13897371191363234, "grad_norm": 0.2392578125, "grad_norm_var": 0.0006663282712300618, "learning_rate": 0.0001, "loss": 1.3485, "loss/crossentropy": 2.660648822784424, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.19226066023111343, "step": 9307 }, { "epoch": 0.1389886440842473, "grad_norm": 0.2265625, "grad_norm_var": 0.0005584677060445149, "learning_rate": 0.0001, "loss": 1.2321, "loss/crossentropy": 2.4094643592834473, "loss/fcd": 1.06640625, "loss/idx": 10.5, "loss/logits": 0.16573724895715714, "step": 9308 }, { "epoch": 0.1390035762548623, "grad_norm": 0.30859375, "grad_norm_var": 0.0008256912231445312, "learning_rate": 0.0001, "loss": 1.3493, "loss/crossentropy": 2.8892029523849487, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.19308098405599594, "step": 9309 }, { "epoch": 0.13901850842547728, "grad_norm": 0.232421875, "grad_norm_var": 0.0008205254872639974, "learning_rate": 0.0001, "loss": 1.2985, "loss/crossentropy": 2.616760015487671, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18914546817541122, "step": 9310 }, { "epoch": 0.13903344059609224, "grad_norm": 0.2373046875, "grad_norm_var": 0.0007545113563537598, "learning_rate": 0.0001, "loss": 1.361, "loss/crossentropy": 2.5503426790237427, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.20477799326181412, "step": 9311 }, { "epoch": 0.13904837276670723, "grad_norm": 0.3046875, "grad_norm_var": 0.001010894775390625, "learning_rate": 0.0001, "loss": 1.588, "loss/crossentropy": 2.64638090133667, "loss/fcd": 1.359375, "loss/idx": 10.5, "loss/logits": 0.22864307463169098, "step": 9312 }, { "epoch": 0.13906330493732222, "grad_norm": 0.259765625, "grad_norm_var": 0.0009586175282796223, "learning_rate": 0.0001, "loss": 1.417, "loss/crossentropy": 2.2575515508651733, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.20605768263339996, "step": 9313 }, { "epoch": 0.1390782371079372, "grad_norm": 0.228515625, "grad_norm_var": 0.0009746392567952474, "learning_rate": 0.0001, "loss": 1.3919, "loss/crossentropy": 2.5320805311203003, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.20439204573631287, "step": 9314 }, { "epoch": 0.13909316927855217, "grad_norm": 0.251953125, "grad_norm_var": 0.0009162743886311849, "learning_rate": 0.0001, "loss": 1.5178, "loss/crossentropy": 2.639780640602112, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.2560637295246124, "step": 9315 }, { "epoch": 0.13910810144916716, "grad_norm": 0.228515625, "grad_norm_var": 0.000895547866821289, "learning_rate": 0.0001, "loss": 1.3504, "loss/crossentropy": 2.5951889753341675, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.2136731743812561, "step": 9316 }, { "epoch": 0.13912303361978215, "grad_norm": 0.2392578125, "grad_norm_var": 0.0008816401163736979, "learning_rate": 0.0001, "loss": 1.3631, "loss/crossentropy": 2.49773633480072, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.19907590746879578, "step": 9317 }, { "epoch": 0.1391379657903971, "grad_norm": 0.298828125, "grad_norm_var": 0.0010571797688802083, "learning_rate": 0.0001, "loss": 1.4208, "loss/crossentropy": 2.734064221382141, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.20206695050001144, "step": 9318 }, { "epoch": 0.1391528979610121, "grad_norm": 0.2236328125, "grad_norm_var": 0.0010436018308003744, "learning_rate": 0.0001, "loss": 1.3414, "loss/crossentropy": 2.5603736639022827, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.18126246333122253, "step": 9319 }, { "epoch": 0.1391678301316271, "grad_norm": 0.22265625, "grad_norm_var": 0.00104063351949056, "learning_rate": 0.0001, "loss": 1.2704, "loss/crossentropy": 2.4921475648880005, "loss/fcd": 1.091796875, "loss/idx": 10.5, "loss/logits": 0.17862620949745178, "step": 9320 }, { "epoch": 0.13918276230224205, "grad_norm": 0.263671875, "grad_norm_var": 0.001001723607381185, "learning_rate": 0.0001, "loss": 1.3046, "loss/crossentropy": 2.4519439935684204, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.17955946177244186, "step": 9321 }, { "epoch": 0.13919769447285704, "grad_norm": 0.2392578125, "grad_norm_var": 0.000854965051015218, "learning_rate": 0.0001, "loss": 1.4687, "loss/crossentropy": 2.432509183883667, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.22264619171619415, "step": 9322 }, { "epoch": 0.13921262664347203, "grad_norm": 0.2197265625, "grad_norm_var": 0.0009075760841369629, "learning_rate": 0.0001, "loss": 1.4182, "loss/crossentropy": 2.36299204826355, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.20340389758348465, "step": 9323 }, { "epoch": 0.13922755881408702, "grad_norm": 0.2265625, "grad_norm_var": 0.0009075760841369629, "learning_rate": 0.0001, "loss": 1.3342, "loss/crossentropy": 2.6924535036087036, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.1897047907114029, "step": 9324 }, { "epoch": 0.13924249098470198, "grad_norm": 0.25, "grad_norm_var": 0.0006572365760803223, "learning_rate": 0.0001, "loss": 1.4005, "loss/crossentropy": 2.6292084455490112, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.20518849790096283, "step": 9325 }, { "epoch": 0.13925742315531697, "grad_norm": 0.296875, "grad_norm_var": 0.0008051514625549317, "learning_rate": 0.0001, "loss": 1.3678, "loss/crossentropy": 2.4677956104278564, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.1724494993686676, "step": 9326 }, { "epoch": 0.13927235532593196, "grad_norm": 0.271484375, "grad_norm_var": 0.0008228143056233724, "learning_rate": 0.0001, "loss": 1.3571, "loss/crossentropy": 2.533463478088379, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.18909919261932373, "step": 9327 }, { "epoch": 0.13928728749654692, "grad_norm": 0.31640625, "grad_norm_var": 0.0009143670399983724, "learning_rate": 0.0001, "loss": 1.5073, "loss/crossentropy": 2.760127067565918, "loss/fcd": 1.2890625, "loss/idx": 10.5, "loss/logits": 0.21819724142551422, "step": 9328 }, { "epoch": 0.13930221966716191, "grad_norm": 0.2294921875, "grad_norm_var": 0.0009415904680887858, "learning_rate": 0.0001, "loss": 1.2345, "loss/crossentropy": 2.598496198654175, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.1603056788444519, "step": 9329 }, { "epoch": 0.1393171518377769, "grad_norm": 0.22265625, "grad_norm_var": 0.0009608546892801921, "learning_rate": 0.0001, "loss": 1.2612, "loss/crossentropy": 2.5162200927734375, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.17527178674936295, "step": 9330 }, { "epoch": 0.1393320840083919, "grad_norm": 0.2421875, "grad_norm_var": 0.000964351495107015, "learning_rate": 0.0001, "loss": 1.3982, "loss/crossentropy": 2.647732734680176, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.2106558382511139, "step": 9331 }, { "epoch": 0.13934701617900686, "grad_norm": 0.2294921875, "grad_norm_var": 0.0009616851806640625, "learning_rate": 0.0001, "loss": 1.2328, "loss/crossentropy": 2.5625083446502686, "loss/fcd": 1.0625, "loss/idx": 10.5, "loss/logits": 0.17025452107191086, "step": 9332 }, { "epoch": 0.13936194834962184, "grad_norm": 0.2431640625, "grad_norm_var": 0.0009572982788085937, "learning_rate": 0.0001, "loss": 1.2934, "loss/crossentropy": 2.5522042512893677, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.17233861982822418, "step": 9333 }, { "epoch": 0.13937688052023683, "grad_norm": 0.2158203125, "grad_norm_var": 0.0008448243141174316, "learning_rate": 0.0001, "loss": 1.3436, "loss/crossentropy": 2.2036937475204468, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.18346232175827026, "step": 9334 }, { "epoch": 0.1393918126908518, "grad_norm": 0.255859375, "grad_norm_var": 0.0008197784423828125, "learning_rate": 0.0001, "loss": 1.5962, "loss/crossentropy": 2.5998224020004272, "loss/fcd": 1.33203125, "loss/idx": 10.5, "loss/logits": 0.2641691043972969, "step": 9335 }, { "epoch": 0.13940674486146679, "grad_norm": 0.275390625, "grad_norm_var": 0.0008253574371337891, "learning_rate": 0.0001, "loss": 1.6152, "loss/crossentropy": 2.387652039527893, "loss/fcd": 1.34765625, "loss/idx": 10.5, "loss/logits": 0.2675458490848541, "step": 9336 }, { "epoch": 0.13942167703208178, "grad_norm": 0.3828125, "grad_norm_var": 0.0019316355387369792, "learning_rate": 0.0001, "loss": 1.5736, "loss/crossentropy": 2.0333293080329895, "loss/fcd": 1.38671875, "loss/idx": 10.5, "loss/logits": 0.18688343465328217, "step": 9337 }, { "epoch": 0.13943660920269674, "grad_norm": 0.23046875, "grad_norm_var": 0.0019576350847880044, "learning_rate": 0.0001, "loss": 1.3054, "loss/crossentropy": 2.7994301319122314, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.1842639297246933, "step": 9338 }, { "epoch": 0.13945154137331173, "grad_norm": 0.255859375, "grad_norm_var": 0.0018607457478841147, "learning_rate": 0.0001, "loss": 1.4782, "loss/crossentropy": 2.7524856328964233, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.2282087430357933, "step": 9339 }, { "epoch": 0.13946647354392672, "grad_norm": 0.25390625, "grad_norm_var": 0.001789093017578125, "learning_rate": 0.0001, "loss": 1.4968, "loss/crossentropy": 2.7598663568496704, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.2350352704524994, "step": 9340 }, { "epoch": 0.1394814057145417, "grad_norm": 0.2451171875, "grad_norm_var": 0.0017975767453511555, "learning_rate": 0.0001, "loss": 1.2524, "loss/crossentropy": 2.6995104551315308, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.1742800921201706, "step": 9341 }, { "epoch": 0.13949633788515667, "grad_norm": 0.248046875, "grad_norm_var": 0.0017093618710835774, "learning_rate": 0.0001, "loss": 1.3573, "loss/crossentropy": 2.7573533058166504, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.20103494077920914, "step": 9342 }, { "epoch": 0.13951127005577166, "grad_norm": 0.251953125, "grad_norm_var": 0.001696487267812093, "learning_rate": 0.0001, "loss": 1.3475, "loss/crossentropy": 2.6734496355056763, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.1990392804145813, "step": 9343 }, { "epoch": 0.13952620222638665, "grad_norm": 0.265625, "grad_norm_var": 0.0014497717221577963, "learning_rate": 0.0001, "loss": 1.441, "loss/crossentropy": 2.5999977588653564, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.2144872099161148, "step": 9344 }, { "epoch": 0.1395411343970016, "grad_norm": 0.2578125, "grad_norm_var": 0.0014111677805582683, "learning_rate": 0.0001, "loss": 1.5488, "loss/crossentropy": 2.404776930809021, "loss/fcd": 1.2890625, "loss/idx": 10.5, "loss/logits": 0.2597474902868271, "step": 9345 }, { "epoch": 0.1395560665676166, "grad_norm": 0.2265625, "grad_norm_var": 0.001395400365193685, "learning_rate": 0.0001, "loss": 1.3026, "loss/crossentropy": 2.5220123529434204, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.18935153633356094, "step": 9346 }, { "epoch": 0.1395709987382316, "grad_norm": 0.2578125, "grad_norm_var": 0.00138395627339681, "learning_rate": 0.0001, "loss": 1.3474, "loss/crossentropy": 2.5931252241134644, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.18336325883865356, "step": 9347 }, { "epoch": 0.13958593090884658, "grad_norm": 0.33984375, "grad_norm_var": 0.0017552971839904785, "learning_rate": 0.0001, "loss": 1.5856, "loss/crossentropy": 2.3005651235580444, "loss/fcd": 1.3671875, "loss/idx": 10.5, "loss/logits": 0.21838000416755676, "step": 9348 }, { "epoch": 0.13960086307946154, "grad_norm": 0.251953125, "grad_norm_var": 0.0017370223999023438, "learning_rate": 0.0001, "loss": 1.3835, "loss/crossentropy": 2.4945400953292847, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.21158810704946518, "step": 9349 }, { "epoch": 0.13961579525007653, "grad_norm": 0.2236328125, "grad_norm_var": 0.0016912460327148438, "learning_rate": 0.0001, "loss": 1.4218, "loss/crossentropy": 2.5730133056640625, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.20308037102222443, "step": 9350 }, { "epoch": 0.13963072742069152, "grad_norm": 0.234375, "grad_norm_var": 0.001743173599243164, "learning_rate": 0.0001, "loss": 1.3445, "loss/crossentropy": 2.7462915182113647, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.19213319569826126, "step": 9351 }, { "epoch": 0.13964565959130648, "grad_norm": 0.2275390625, "grad_norm_var": 0.001804506778717041, "learning_rate": 0.0001, "loss": 1.2822, "loss/crossentropy": 2.6424968242645264, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.17279192805290222, "step": 9352 }, { "epoch": 0.13966059176192147, "grad_norm": 0.23046875, "grad_norm_var": 0.0007519364356994629, "learning_rate": 0.0001, "loss": 1.3235, "loss/crossentropy": 2.536076784133911, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.1828458458185196, "step": 9353 }, { "epoch": 0.13967552393253646, "grad_norm": 0.2265625, "grad_norm_var": 0.000763094425201416, "learning_rate": 0.0001, "loss": 1.2548, "loss/crossentropy": 2.5964611768722534, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.15712198615074158, "step": 9354 }, { "epoch": 0.13969045610315142, "grad_norm": 0.2265625, "grad_norm_var": 0.0007931351661682129, "learning_rate": 0.0001, "loss": 1.2832, "loss/crossentropy": 2.6997064352035522, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.17384489625692368, "step": 9355 }, { "epoch": 0.1397053882737664, "grad_norm": 0.2158203125, "grad_norm_var": 0.000853729248046875, "learning_rate": 0.0001, "loss": 1.2719, "loss/crossentropy": 2.6847126483917236, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.17424454540014267, "step": 9356 }, { "epoch": 0.1397203204443814, "grad_norm": 0.21875, "grad_norm_var": 0.000898897647857666, "learning_rate": 0.0001, "loss": 1.3677, "loss/crossentropy": 2.693748712539673, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.20756947994232178, "step": 9357 }, { "epoch": 0.1397352526149964, "grad_norm": 0.2470703125, "grad_norm_var": 0.000898424784342448, "learning_rate": 0.0001, "loss": 1.3897, "loss/crossentropy": 2.643229365348816, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.20223893970251083, "step": 9358 }, { "epoch": 0.13975018478561135, "grad_norm": 0.287109375, "grad_norm_var": 0.0010134379069010417, "learning_rate": 0.0001, "loss": 1.3603, "loss/crossentropy": 2.6802397966384888, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.1883796900510788, "step": 9359 }, { "epoch": 0.13976511695622634, "grad_norm": 0.255859375, "grad_norm_var": 0.000993967056274414, "learning_rate": 0.0001, "loss": 1.4434, "loss/crossentropy": 2.5440211296081543, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.20125054568052292, "step": 9360 }, { "epoch": 0.13978004912684133, "grad_norm": 0.2041015625, "grad_norm_var": 0.0010859767595926921, "learning_rate": 0.0001, "loss": 1.3038, "loss/crossentropy": 2.639333963394165, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.19439156353473663, "step": 9361 }, { "epoch": 0.1397949812974563, "grad_norm": 0.240234375, "grad_norm_var": 0.0010692874590555828, "learning_rate": 0.0001, "loss": 1.4111, "loss/crossentropy": 2.4610507488250732, "loss/fcd": 1.19921875, "loss/idx": 10.5, "loss/logits": 0.21183635294437408, "step": 9362 }, { "epoch": 0.13980991346807128, "grad_norm": 0.265625, "grad_norm_var": 0.0010885516802469888, "learning_rate": 0.0001, "loss": 1.2388, "loss/crossentropy": 2.490389823913574, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.164572574198246, "step": 9363 }, { "epoch": 0.13982484563868627, "grad_norm": 0.2236328125, "grad_norm_var": 0.00043931007385253904, "learning_rate": 0.0001, "loss": 1.3662, "loss/crossentropy": 2.5170371532440186, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.20604997128248215, "step": 9364 }, { "epoch": 0.13983977780930124, "grad_norm": 0.283203125, "grad_norm_var": 0.0005659580230712891, "learning_rate": 0.0001, "loss": 1.3528, "loss/crossentropy": 2.8670246601104736, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.2004767581820488, "step": 9365 }, { "epoch": 0.13985470997991623, "grad_norm": 0.267578125, "grad_norm_var": 0.0006015419960021973, "learning_rate": 0.0001, "loss": 1.3358, "loss/crossentropy": 2.590936303138733, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.19516436010599136, "step": 9366 }, { "epoch": 0.13986964215053121, "grad_norm": 0.2333984375, "grad_norm_var": 0.0006024519602457683, "learning_rate": 0.0001, "loss": 1.1968, "loss/crossentropy": 2.8796170949935913, "loss/fcd": 1.04296875, "loss/idx": 10.5, "loss/logits": 0.1538618952035904, "step": 9367 }, { "epoch": 0.1398845743211462, "grad_norm": 0.2470703125, "grad_norm_var": 0.0005916436513264974, "learning_rate": 0.0001, "loss": 1.3872, "loss/crossentropy": 2.6041109561920166, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.20361346006393433, "step": 9368 }, { "epoch": 0.13989950649176117, "grad_norm": 0.224609375, "grad_norm_var": 0.0006028493245442708, "learning_rate": 0.0001, "loss": 1.3436, "loss/crossentropy": 2.723627209663391, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.20294230431318283, "step": 9369 }, { "epoch": 0.13991443866237616, "grad_norm": 0.34765625, "grad_norm_var": 0.0012749354044596353, "learning_rate": 0.0001, "loss": 1.7753, "loss/crossentropy": 2.449383497238159, "loss/fcd": 1.4921875, "loss/idx": 10.5, "loss/logits": 0.28312036395072937, "step": 9370 }, { "epoch": 0.13992937083299115, "grad_norm": 0.458984375, "grad_norm_var": 0.003947559992472331, "learning_rate": 0.0001, "loss": 1.9903, "loss/crossentropy": 2.7601406574249268, "loss/fcd": 1.58203125, "loss/idx": 10.5, "loss/logits": 0.4082246646285057, "step": 9371 }, { "epoch": 0.1399443030036061, "grad_norm": 0.2236328125, "grad_norm_var": 0.003901402155558268, "learning_rate": 0.0001, "loss": 1.3865, "loss/crossentropy": 2.393273949623108, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.19117314368486404, "step": 9372 }, { "epoch": 0.1399592351742211, "grad_norm": 0.2265625, "grad_norm_var": 0.0038577874501546225, "learning_rate": 0.0001, "loss": 1.2917, "loss/crossentropy": 2.7911499738693237, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1784394010901451, "step": 9373 }, { "epoch": 0.1399741673448361, "grad_norm": 0.291015625, "grad_norm_var": 0.0038747747739156088, "learning_rate": 0.0001, "loss": 1.5175, "loss/crossentropy": 2.684444785118103, "loss/fcd": 1.2890625, "loss/idx": 10.5, "loss/logits": 0.22840169072151184, "step": 9374 }, { "epoch": 0.13998909951545108, "grad_norm": 0.275390625, "grad_norm_var": 0.0038527448972066245, "learning_rate": 0.0001, "loss": 1.5318, "loss/crossentropy": 2.6512210369110107, "loss/fcd": 1.28125, "loss/idx": 10.5, "loss/logits": 0.250586673617363, "step": 9375 }, { "epoch": 0.14000403168606604, "grad_norm": 0.271484375, "grad_norm_var": 0.003845242659250895, "learning_rate": 0.0001, "loss": 1.1701, "loss/crossentropy": 2.4975212812423706, "loss/fcd": 1.01171875, "loss/idx": 10.5, "loss/logits": 0.15838327258825302, "step": 9376 }, { "epoch": 0.14001896385668103, "grad_norm": 0.291015625, "grad_norm_var": 0.0035796483357747396, "learning_rate": 0.0001, "loss": 1.3705, "loss/crossentropy": 2.726920962333679, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.19467688351869583, "step": 9377 }, { "epoch": 0.14003389602729602, "grad_norm": 0.2255859375, "grad_norm_var": 0.0036574323972066243, "learning_rate": 0.0001, "loss": 1.335, "loss/crossentropy": 2.7143665552139282, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.19440902024507523, "step": 9378 }, { "epoch": 0.14004882819791098, "grad_norm": 0.259765625, "grad_norm_var": 0.0036647756894429523, "learning_rate": 0.0001, "loss": 1.3003, "loss/crossentropy": 2.46094012260437, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.17925123125314713, "step": 9379 }, { "epoch": 0.14006376036852597, "grad_norm": 0.2578125, "grad_norm_var": 0.003517770767211914, "learning_rate": 0.0001, "loss": 1.3762, "loss/crossentropy": 2.5968598127365112, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20041067898273468, "step": 9380 }, { "epoch": 0.14007869253914096, "grad_norm": 0.201171875, "grad_norm_var": 0.003838205337524414, "learning_rate": 0.0001, "loss": 1.2561, "loss/crossentropy": 2.666003465652466, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.17019468545913696, "step": 9381 }, { "epoch": 0.14009362470975592, "grad_norm": 0.24609375, "grad_norm_var": 0.0038709004720052084, "learning_rate": 0.0001, "loss": 1.3963, "loss/crossentropy": 2.5593801736831665, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.2010243460536003, "step": 9382 }, { "epoch": 0.1401085568803709, "grad_norm": 0.212890625, "grad_norm_var": 0.003990646203358968, "learning_rate": 0.0001, "loss": 1.182, "loss/crossentropy": 2.566577434539795, "loss/fcd": 1.03515625, "loss/idx": 10.5, "loss/logits": 0.1467982828617096, "step": 9383 }, { "epoch": 0.1401234890509859, "grad_norm": 0.220703125, "grad_norm_var": 0.004101689656575521, "learning_rate": 0.0001, "loss": 1.3424, "loss/crossentropy": 2.5115214586257935, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.1939288154244423, "step": 9384 }, { "epoch": 0.1401384212216009, "grad_norm": 0.21875, "grad_norm_var": 0.00413511594136556, "learning_rate": 0.0001, "loss": 1.318, "loss/crossentropy": 2.7243233919143677, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.1969488263130188, "step": 9385 }, { "epoch": 0.14015335339221585, "grad_norm": 0.236328125, "grad_norm_var": 0.003672154744466146, "learning_rate": 0.0001, "loss": 1.4613, "loss/crossentropy": 2.4387080669403076, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.21134644746780396, "step": 9386 }, { "epoch": 0.14016828556283084, "grad_norm": 0.21875, "grad_norm_var": 0.0008197625478108724, "learning_rate": 0.0001, "loss": 1.2748, "loss/crossentropy": 2.639375925064087, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.1810377836227417, "step": 9387 }, { "epoch": 0.14018321773344583, "grad_norm": 0.376953125, "grad_norm_var": 0.0019071539243062337, "learning_rate": 0.0001, "loss": 1.6395, "loss/crossentropy": 2.5497896671295166, "loss/fcd": 1.36328125, "loss/idx": 10.5, "loss/logits": 0.27620121091604233, "step": 9388 }, { "epoch": 0.1401981499040608, "grad_norm": 0.2080078125, "grad_norm_var": 0.0019913355509440104, "learning_rate": 0.0001, "loss": 1.3128, "loss/crossentropy": 2.560211420059204, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.18000486493110657, "step": 9389 }, { "epoch": 0.14021308207467578, "grad_norm": 0.263671875, "grad_norm_var": 0.0018911997477213541, "learning_rate": 0.0001, "loss": 1.4794, "loss/crossentropy": 2.5283323526382446, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.2138214036822319, "step": 9390 }, { "epoch": 0.14022801424529077, "grad_norm": 0.224609375, "grad_norm_var": 0.0018738428751627603, "learning_rate": 0.0001, "loss": 1.3443, "loss/crossentropy": 2.7384533882141113, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.20367469638586044, "step": 9391 }, { "epoch": 0.14024294641590576, "grad_norm": 0.244140625, "grad_norm_var": 0.001827112833658854, "learning_rate": 0.0001, "loss": 1.396, "loss/crossentropy": 2.700668215751648, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.21242806315422058, "step": 9392 }, { "epoch": 0.14025787858652072, "grad_norm": 0.265625, "grad_norm_var": 0.0017087141672770183, "learning_rate": 0.0001, "loss": 1.4609, "loss/crossentropy": 2.751060366630554, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.22259077429771423, "step": 9393 }, { "epoch": 0.1402728107571357, "grad_norm": 0.5078125, "grad_norm_var": 0.006048452854156494, "learning_rate": 0.0001, "loss": 1.435, "loss/crossentropy": 2.357629418373108, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.21621837466955185, "step": 9394 }, { "epoch": 0.1402877429277507, "grad_norm": 0.25, "grad_norm_var": 0.006054969628651937, "learning_rate": 0.0001, "loss": 1.4234, "loss/crossentropy": 2.5503010749816895, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.20078598707914352, "step": 9395 }, { "epoch": 0.14030267509836566, "grad_norm": 0.251953125, "grad_norm_var": 0.00605849822362264, "learning_rate": 0.0001, "loss": 1.3372, "loss/crossentropy": 2.338261365890503, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.18482866138219833, "step": 9396 }, { "epoch": 0.14031760726898065, "grad_norm": 0.212890625, "grad_norm_var": 0.005976386864980062, "learning_rate": 0.0001, "loss": 1.3115, "loss/crossentropy": 2.6595476865768433, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.17475764453411102, "step": 9397 }, { "epoch": 0.14033253943959564, "grad_norm": 0.232421875, "grad_norm_var": 0.006013325850168864, "learning_rate": 0.0001, "loss": 1.3559, "loss/crossentropy": 2.698323965072632, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.19964487105607986, "step": 9398 }, { "epoch": 0.1403474716102106, "grad_norm": 0.2265625, "grad_norm_var": 0.005940783023834229, "learning_rate": 0.0001, "loss": 1.4079, "loss/crossentropy": 2.7429248094558716, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.22039157152175903, "step": 9399 }, { "epoch": 0.1403624037808256, "grad_norm": 0.26953125, "grad_norm_var": 0.0058342893918355305, "learning_rate": 0.0001, "loss": 1.4016, "loss/crossentropy": 2.707392930984497, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.19066072255373, "step": 9400 }, { "epoch": 0.14037733595144058, "grad_norm": 0.2373046875, "grad_norm_var": 0.005746332804361979, "learning_rate": 0.0001, "loss": 1.2868, "loss/crossentropy": 2.4562325477600098, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.1696023941040039, "step": 9401 }, { "epoch": 0.14039226812205557, "grad_norm": 0.228515625, "grad_norm_var": 0.005779139200846354, "learning_rate": 0.0001, "loss": 1.2614, "loss/crossentropy": 2.492369055747986, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.16370391845703125, "step": 9402 }, { "epoch": 0.14040720029267054, "grad_norm": 0.244140625, "grad_norm_var": 0.005667352676391601, "learning_rate": 0.0001, "loss": 1.3577, "loss/crossentropy": 2.5879374742507935, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.193653404712677, "step": 9403 }, { "epoch": 0.14042213246328553, "grad_norm": 0.39453125, "grad_norm_var": 0.005948448181152343, "learning_rate": 0.0001, "loss": 1.3161, "loss/crossentropy": 2.5362937450408936, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.15593283623456955, "step": 9404 }, { "epoch": 0.14043706463390052, "grad_norm": 0.2294921875, "grad_norm_var": 0.005810149510701497, "learning_rate": 0.0001, "loss": 1.2659, "loss/crossentropy": 2.1574243903160095, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.15650713443756104, "step": 9405 }, { "epoch": 0.14045199680451548, "grad_norm": 0.23828125, "grad_norm_var": 0.005864079793294271, "learning_rate": 0.0001, "loss": 1.2615, "loss/crossentropy": 2.5505348443984985, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.1677914559841156, "step": 9406 }, { "epoch": 0.14046692897513047, "grad_norm": 0.244140625, "grad_norm_var": 0.005779838562011719, "learning_rate": 0.0001, "loss": 1.3742, "loss/crossentropy": 2.5422619581222534, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.2023647576570511, "step": 9407 }, { "epoch": 0.14048186114574546, "grad_norm": 0.244140625, "grad_norm_var": 0.005779838562011719, "learning_rate": 0.0001, "loss": 1.3276, "loss/crossentropy": 2.4123538732528687, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.17526912689208984, "step": 9408 }, { "epoch": 0.14049679331636045, "grad_norm": 0.232421875, "grad_norm_var": 0.00585630734761556, "learning_rate": 0.0001, "loss": 1.4194, "loss/crossentropy": 2.496854066848755, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.2123575434088707, "step": 9409 }, { "epoch": 0.1405117254869754, "grad_norm": 0.2255859375, "grad_norm_var": 0.0017072002092997232, "learning_rate": 0.0001, "loss": 1.3129, "loss/crossentropy": 2.6829068660736084, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.18402884155511856, "step": 9410 }, { "epoch": 0.1405266576575904, "grad_norm": 0.3046875, "grad_norm_var": 0.001911477247873942, "learning_rate": 0.0001, "loss": 1.5447, "loss/crossentropy": 2.5750622749328613, "loss/fcd": 1.31640625, "loss/idx": 10.5, "loss/logits": 0.22830751538276672, "step": 9411 }, { "epoch": 0.1405415898282054, "grad_norm": 0.263671875, "grad_norm_var": 0.0019214908281962077, "learning_rate": 0.0001, "loss": 1.2924, "loss/crossentropy": 2.6511389017105103, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18304108828306198, "step": 9412 }, { "epoch": 0.14055652199882035, "grad_norm": 0.2421875, "grad_norm_var": 0.001823262373606364, "learning_rate": 0.0001, "loss": 1.3035, "loss/crossentropy": 2.612966775894165, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.1785156950354576, "step": 9413 }, { "epoch": 0.14057145416943534, "grad_norm": 0.25, "grad_norm_var": 0.0017929355303446452, "learning_rate": 0.0001, "loss": 1.4865, "loss/crossentropy": 2.677309989929199, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.24429336190223694, "step": 9414 }, { "epoch": 0.14058638634005033, "grad_norm": 0.26953125, "grad_norm_var": 0.001747127374013265, "learning_rate": 0.0001, "loss": 1.4091, "loss/crossentropy": 2.7122501134872437, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.21764836460351944, "step": 9415 }, { "epoch": 0.1406013185106653, "grad_norm": 0.2392578125, "grad_norm_var": 0.0017553806304931641, "learning_rate": 0.0001, "loss": 1.2522, "loss/crossentropy": 2.7039082050323486, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.17011941969394684, "step": 9416 }, { "epoch": 0.14061625068128028, "grad_norm": 0.2119140625, "grad_norm_var": 0.0018572489420572917, "learning_rate": 0.0001, "loss": 1.3195, "loss/crossentropy": 2.5773316621780396, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1905713975429535, "step": 9417 }, { "epoch": 0.14063118285189527, "grad_norm": 0.2470703125, "grad_norm_var": 0.001815950870513916, "learning_rate": 0.0001, "loss": 1.2807, "loss/crossentropy": 2.75490140914917, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.17914298921823502, "step": 9418 }, { "epoch": 0.14064611502251026, "grad_norm": 0.228515625, "grad_norm_var": 0.0018539706865946452, "learning_rate": 0.0001, "loss": 1.1685, "loss/crossentropy": 2.614574909210205, "loss/fcd": 1.0234375, "loss/idx": 10.5, "loss/logits": 0.1450720727443695, "step": 9419 }, { "epoch": 0.14066104719312522, "grad_norm": 0.2255859375, "grad_norm_var": 0.0004742781321207682, "learning_rate": 0.0001, "loss": 1.2358, "loss/crossentropy": 2.5021690130233765, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.1615699827671051, "step": 9420 }, { "epoch": 0.1406759793637402, "grad_norm": 0.23828125, "grad_norm_var": 0.00046265522638956705, "learning_rate": 0.0001, "loss": 1.2544, "loss/crossentropy": 2.6698139905929565, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.16065531224012375, "step": 9421 }, { "epoch": 0.1406909115343552, "grad_norm": 0.22265625, "grad_norm_var": 0.0004899938901265462, "learning_rate": 0.0001, "loss": 1.4585, "loss/crossentropy": 2.533944010734558, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.231968492269516, "step": 9422 }, { "epoch": 0.14070584370497016, "grad_norm": 0.251953125, "grad_norm_var": 0.0004948894182840983, "learning_rate": 0.0001, "loss": 1.3448, "loss/crossentropy": 2.7489761114120483, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.1963423490524292, "step": 9423 }, { "epoch": 0.14072077587558515, "grad_norm": 0.287109375, "grad_norm_var": 0.0006134311358133951, "learning_rate": 0.0001, "loss": 1.2761, "loss/crossentropy": 3.0855202674865723, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.15892648696899414, "step": 9424 }, { "epoch": 0.14073570804620014, "grad_norm": 0.2578125, "grad_norm_var": 0.0006068189938863119, "learning_rate": 0.0001, "loss": 1.4419, "loss/crossentropy": 2.4348597526550293, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.21140915155410767, "step": 9425 }, { "epoch": 0.1407506402168151, "grad_norm": 0.23828125, "grad_norm_var": 0.0005791823069254558, "learning_rate": 0.0001, "loss": 1.2483, "loss/crossentropy": 2.4934498071670532, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.1702190712094307, "step": 9426 }, { "epoch": 0.1407655723874301, "grad_norm": 0.2255859375, "grad_norm_var": 0.0003793040911356608, "learning_rate": 0.0001, "loss": 1.2746, "loss/crossentropy": 2.8044556379318237, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.18084637820720673, "step": 9427 }, { "epoch": 0.14078050455804508, "grad_norm": 0.232421875, "grad_norm_var": 0.0003571788469950358, "learning_rate": 0.0001, "loss": 1.2763, "loss/crossentropy": 2.4803842306137085, "loss/fcd": 1.1015625, "loss/idx": 10.5, "loss/logits": 0.17474903166294098, "step": 9428 }, { "epoch": 0.14079543672866007, "grad_norm": 0.251953125, "grad_norm_var": 0.0003636956214904785, "learning_rate": 0.0001, "loss": 1.6112, "loss/crossentropy": 2.511532783508301, "loss/fcd": 1.36328125, "loss/idx": 10.5, "loss/logits": 0.24788732826709747, "step": 9429 }, { "epoch": 0.14081036889927503, "grad_norm": 0.2578125, "grad_norm_var": 0.0003754576047261556, "learning_rate": 0.0001, "loss": 1.4356, "loss/crossentropy": 2.705975651741028, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.21689876168966293, "step": 9430 }, { "epoch": 0.14082530106989002, "grad_norm": 0.220703125, "grad_norm_var": 0.00035082101821899415, "learning_rate": 0.0001, "loss": 1.2175, "loss/crossentropy": 2.4534956216812134, "loss/fcd": 1.05859375, "loss/idx": 10.5, "loss/logits": 0.158938467502594, "step": 9431 }, { "epoch": 0.14084023324050501, "grad_norm": 0.2470703125, "grad_norm_var": 0.0003540635108947754, "learning_rate": 0.0001, "loss": 1.3821, "loss/crossentropy": 2.6897482872009277, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.19074058532714844, "step": 9432 }, { "epoch": 0.14085516541111998, "grad_norm": 0.28515625, "grad_norm_var": 0.00041217803955078124, "learning_rate": 0.0001, "loss": 1.4359, "loss/crossentropy": 2.494102120399475, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.2015157863497734, "step": 9433 }, { "epoch": 0.14087009758173497, "grad_norm": 0.216796875, "grad_norm_var": 0.0004605889320373535, "learning_rate": 0.0001, "loss": 1.3235, "loss/crossentropy": 2.5970932245254517, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.19071338325738907, "step": 9434 }, { "epoch": 0.14088502975234996, "grad_norm": 0.44921875, "grad_norm_var": 0.0030792832374572756, "learning_rate": 0.0001, "loss": 2.5696, "loss/crossentropy": 2.514261245727539, "loss/fcd": 2.17578125, "loss/idx": 10.5, "loss/logits": 0.39386773109436035, "step": 9435 }, { "epoch": 0.14089996192296494, "grad_norm": 0.298828125, "grad_norm_var": 0.0031099796295166017, "learning_rate": 0.0001, "loss": 1.4118, "loss/crossentropy": 2.553678274154663, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.20084989070892334, "step": 9436 }, { "epoch": 0.1409148940935799, "grad_norm": 0.251953125, "grad_norm_var": 0.0030796051025390623, "learning_rate": 0.0001, "loss": 1.4711, "loss/crossentropy": 2.4640175104141235, "loss/fcd": 1.23828125, "loss/idx": 10.5, "loss/logits": 0.2328178733587265, "step": 9437 }, { "epoch": 0.1409298262641949, "grad_norm": 0.2236328125, "grad_norm_var": 0.0030745148658752443, "learning_rate": 0.0001, "loss": 1.3035, "loss/crossentropy": 2.464409112930298, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1745523288846016, "step": 9438 }, { "epoch": 0.14094475843480989, "grad_norm": 0.2197265625, "grad_norm_var": 0.003183746337890625, "learning_rate": 0.0001, "loss": 1.3626, "loss/crossentropy": 2.62226939201355, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.19465304166078568, "step": 9439 }, { "epoch": 0.14095969060542485, "grad_norm": 0.2080078125, "grad_norm_var": 0.0032915711402893067, "learning_rate": 0.0001, "loss": 1.3535, "loss/crossentropy": 2.7552565336227417, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.20503991097211838, "step": 9440 }, { "epoch": 0.14097462277603984, "grad_norm": 0.2578125, "grad_norm_var": 0.0032915711402893067, "learning_rate": 0.0001, "loss": 1.3582, "loss/crossentropy": 2.6072330474853516, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.18242467194795609, "step": 9441 }, { "epoch": 0.14098955494665483, "grad_norm": 0.2158203125, "grad_norm_var": 0.0033740997314453125, "learning_rate": 0.0001, "loss": 1.305, "loss/crossentropy": 2.5617127418518066, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.17995470762252808, "step": 9442 }, { "epoch": 0.1410044871172698, "grad_norm": 0.2431640625, "grad_norm_var": 0.003327035903930664, "learning_rate": 0.0001, "loss": 1.4076, "loss/crossentropy": 2.8026100397109985, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.21225174516439438, "step": 9443 }, { "epoch": 0.14101941928788478, "grad_norm": 0.2255859375, "grad_norm_var": 0.003350540002187093, "learning_rate": 0.0001, "loss": 1.3437, "loss/crossentropy": 2.5446507930755615, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.18741007149219513, "step": 9444 }, { "epoch": 0.14103435145849977, "grad_norm": 0.2412109375, "grad_norm_var": 0.00336151123046875, "learning_rate": 0.0001, "loss": 1.2909, "loss/crossentropy": 2.756899356842041, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.18543770909309387, "step": 9445 }, { "epoch": 0.14104928362911476, "grad_norm": 0.26953125, "grad_norm_var": 0.003376197814941406, "learning_rate": 0.0001, "loss": 1.4528, "loss/crossentropy": 2.642755389213562, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.2028195485472679, "step": 9446 }, { "epoch": 0.14106421579972972, "grad_norm": 0.234375, "grad_norm_var": 0.0033260186513264975, "learning_rate": 0.0001, "loss": 1.3611, "loss/crossentropy": 2.3073419332504272, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.18531298637390137, "step": 9447 }, { "epoch": 0.1410791479703447, "grad_norm": 0.2216796875, "grad_norm_var": 0.003394826253255208, "learning_rate": 0.0001, "loss": 1.4281, "loss/crossentropy": 2.611891984939575, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.2327624261379242, "step": 9448 }, { "epoch": 0.1410940801409597, "grad_norm": 0.224609375, "grad_norm_var": 0.0033716678619384764, "learning_rate": 0.0001, "loss": 1.3254, "loss/crossentropy": 2.5995534658432007, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.18478266149759293, "step": 9449 }, { "epoch": 0.14110901231157466, "grad_norm": 0.251953125, "grad_norm_var": 0.003292703628540039, "learning_rate": 0.0001, "loss": 1.5153, "loss/crossentropy": 2.11775803565979, "loss/fcd": 1.3125, "loss/idx": 10.5, "loss/logits": 0.20275165885686874, "step": 9450 }, { "epoch": 0.14112394448218965, "grad_norm": 0.234375, "grad_norm_var": 0.0005372206370035808, "learning_rate": 0.0001, "loss": 1.4254, "loss/crossentropy": 2.605180501937866, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.22231854498386383, "step": 9451 }, { "epoch": 0.14113887665280464, "grad_norm": 0.23046875, "grad_norm_var": 0.0002829869588216146, "learning_rate": 0.0001, "loss": 1.2279, "loss/crossentropy": 2.517146944999695, "loss/fcd": 1.0703125, "loss/idx": 10.5, "loss/logits": 0.15758377313613892, "step": 9452 }, { "epoch": 0.14115380882341963, "grad_norm": 0.2421875, "grad_norm_var": 0.0002663771311442057, "learning_rate": 0.0001, "loss": 1.2345, "loss/crossentropy": 2.6586742401123047, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.16032998263835907, "step": 9453 }, { "epoch": 0.1411687409940346, "grad_norm": 0.1982421875, "grad_norm_var": 0.000341796875, "learning_rate": 0.0001, "loss": 1.1578, "loss/crossentropy": 2.4689308404922485, "loss/fcd": 1.017578125, "loss/idx": 10.5, "loss/logits": 0.1402178481221199, "step": 9454 }, { "epoch": 0.14118367316464958, "grad_norm": 0.36328125, "grad_norm_var": 0.0013867974281311034, "learning_rate": 0.0001, "loss": 1.3597, "loss/crossentropy": 2.6437790393829346, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.19173185527324677, "step": 9455 }, { "epoch": 0.14119860533526457, "grad_norm": 0.2216796875, "grad_norm_var": 0.0013376196225484212, "learning_rate": 0.0001, "loss": 1.3359, "loss/crossentropy": 2.496284008026123, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.17574810981750488, "step": 9456 }, { "epoch": 0.14121353750587953, "grad_norm": 0.263671875, "grad_norm_var": 0.001351924737294515, "learning_rate": 0.0001, "loss": 1.4934, "loss/crossentropy": 2.206523895263672, "loss/fcd": 1.28515625, "loss/idx": 10.5, "loss/logits": 0.20827506482601166, "step": 9457 }, { "epoch": 0.14122846967649452, "grad_norm": 0.251953125, "grad_norm_var": 0.0013044357299804687, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.426393151283264, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.20780686289072037, "step": 9458 }, { "epoch": 0.1412434018471095, "grad_norm": 0.228515625, "grad_norm_var": 0.0013211846351623535, "learning_rate": 0.0001, "loss": 1.3312, "loss/crossentropy": 2.4767647981643677, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.1905585080385208, "step": 9459 }, { "epoch": 0.14125833401772447, "grad_norm": 0.28125, "grad_norm_var": 0.001378488540649414, "learning_rate": 0.0001, "loss": 1.6384, "loss/crossentropy": 2.738950729370117, "loss/fcd": 1.38671875, "loss/idx": 10.5, "loss/logits": 0.2517043501138687, "step": 9460 }, { "epoch": 0.14127326618833946, "grad_norm": 0.28515625, "grad_norm_var": 0.001462709903717041, "learning_rate": 0.0001, "loss": 1.4164, "loss/crossentropy": 2.4969414472579956, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.20543763041496277, "step": 9461 }, { "epoch": 0.14128819835895445, "grad_norm": 0.2158203125, "grad_norm_var": 0.0015044530232747396, "learning_rate": 0.0001, "loss": 1.3795, "loss/crossentropy": 2.6142162084579468, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.21542883664369583, "step": 9462 }, { "epoch": 0.14130313052956944, "grad_norm": 0.2490234375, "grad_norm_var": 0.001493545373280843, "learning_rate": 0.0001, "loss": 1.3167, "loss/crossentropy": 2.8047693967819214, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.17610513418912888, "step": 9463 }, { "epoch": 0.1413180627001844, "grad_norm": 0.265625, "grad_norm_var": 0.0014615376790364583, "learning_rate": 0.0001, "loss": 1.3546, "loss/crossentropy": 2.662292242050171, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.2062043845653534, "step": 9464 }, { "epoch": 0.1413329948707994, "grad_norm": 0.228515625, "grad_norm_var": 0.0014490127563476563, "learning_rate": 0.0001, "loss": 1.2839, "loss/crossentropy": 2.656335711479187, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.18620942533016205, "step": 9465 }, { "epoch": 0.14134792704141438, "grad_norm": 0.2333984375, "grad_norm_var": 0.0014675100644429525, "learning_rate": 0.0001, "loss": 1.3819, "loss/crossentropy": 2.668669819831848, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20610079914331436, "step": 9466 }, { "epoch": 0.14136285921202935, "grad_norm": 0.2392578125, "grad_norm_var": 0.0014591058095296224, "learning_rate": 0.0001, "loss": 1.3857, "loss/crossentropy": 2.787474513053894, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.19819419085979462, "step": 9467 }, { "epoch": 0.14137779138264434, "grad_norm": 0.2490234375, "grad_norm_var": 0.0014326055844624838, "learning_rate": 0.0001, "loss": 1.4706, "loss/crossentropy": 2.2136403918266296, "loss/fcd": 1.24609375, "loss/idx": 10.5, "loss/logits": 0.22450627386569977, "step": 9468 }, { "epoch": 0.14139272355325933, "grad_norm": 0.287109375, "grad_norm_var": 0.0015057206153869628, "learning_rate": 0.0001, "loss": 1.5117, "loss/crossentropy": 2.419519782066345, "loss/fcd": 1.296875, "loss/idx": 10.5, "loss/logits": 0.21485332399606705, "step": 9469 }, { "epoch": 0.14140765572387431, "grad_norm": 0.2451171875, "grad_norm_var": 0.0012955307960510255, "learning_rate": 0.0001, "loss": 1.3164, "loss/crossentropy": 2.6398861408233643, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.17967846989631653, "step": 9470 }, { "epoch": 0.14142258789448928, "grad_norm": 0.2373046875, "grad_norm_var": 0.0004984378814697265, "learning_rate": 0.0001, "loss": 1.2894, "loss/crossentropy": 2.6310596466064453, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.17608866840600967, "step": 9471 }, { "epoch": 0.14143752006510427, "grad_norm": 0.2734375, "grad_norm_var": 0.0004780093828837077, "learning_rate": 0.0001, "loss": 1.5342, "loss/crossentropy": 2.2033525705337524, "loss/fcd": 1.32421875, "loss/idx": 10.5, "loss/logits": 0.20994798839092255, "step": 9472 }, { "epoch": 0.14145245223571926, "grad_norm": 0.232421875, "grad_norm_var": 0.0004909793535868327, "learning_rate": 0.0001, "loss": 1.4267, "loss/crossentropy": 2.409693956375122, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.21573715656995773, "step": 9473 }, { "epoch": 0.14146738440633422, "grad_norm": 0.2314453125, "grad_norm_var": 0.0005124251047770183, "learning_rate": 0.0001, "loss": 1.2918, "loss/crossentropy": 2.529270052909851, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18240444362163544, "step": 9474 }, { "epoch": 0.1414823165769492, "grad_norm": 0.279296875, "grad_norm_var": 0.0005355676015218099, "learning_rate": 0.0001, "loss": 1.4512, "loss/crossentropy": 2.5743097066879272, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.21678229421377182, "step": 9475 }, { "epoch": 0.1414972487475642, "grad_norm": 0.259765625, "grad_norm_var": 0.00048084259033203124, "learning_rate": 0.0001, "loss": 1.4221, "loss/crossentropy": 2.6613821983337402, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.21120084822177887, "step": 9476 }, { "epoch": 0.14151218091817916, "grad_norm": 0.228515625, "grad_norm_var": 0.0004213809967041016, "learning_rate": 0.0001, "loss": 1.4373, "loss/crossentropy": 2.652271032333374, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.2263631597161293, "step": 9477 }, { "epoch": 0.14152711308879415, "grad_norm": 0.2099609375, "grad_norm_var": 0.00044803619384765626, "learning_rate": 0.0001, "loss": 1.402, "loss/crossentropy": 2.5242390632629395, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.22624558955430984, "step": 9478 }, { "epoch": 0.14154204525940914, "grad_norm": 0.255859375, "grad_norm_var": 0.0004529595375061035, "learning_rate": 0.0001, "loss": 1.5171, "loss/crossentropy": 2.4442696571350098, "loss/fcd": 1.2890625, "loss/idx": 10.5, "loss/logits": 0.22801907360553741, "step": 9479 }, { "epoch": 0.14155697743002413, "grad_norm": 0.232421875, "grad_norm_var": 0.00044052998224894206, "learning_rate": 0.0001, "loss": 1.1364, "loss/crossentropy": 2.4154821634292603, "loss/fcd": 0.99609375, "loss/idx": 10.5, "loss/logits": 0.1402997449040413, "step": 9480 }, { "epoch": 0.1415719096006391, "grad_norm": 0.2421875, "grad_norm_var": 0.00042183796564737954, "learning_rate": 0.0001, "loss": 1.3224, "loss/crossentropy": 2.4912497997283936, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1934981346130371, "step": 9481 }, { "epoch": 0.14158684177125408, "grad_norm": 0.255859375, "grad_norm_var": 0.0004155317942301432, "learning_rate": 0.0001, "loss": 1.4621, "loss/crossentropy": 2.5575389862060547, "loss/fcd": 1.25, "loss/idx": 10.5, "loss/logits": 0.21213823556900024, "step": 9482 }, { "epoch": 0.14160177394186907, "grad_norm": 0.2373046875, "grad_norm_var": 0.00041790008544921874, "learning_rate": 0.0001, "loss": 1.3241, "loss/crossentropy": 2.630358099937439, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.17954210937023163, "step": 9483 }, { "epoch": 0.14161670611248403, "grad_norm": 0.228515625, "grad_norm_var": 0.0004395127296447754, "learning_rate": 0.0001, "loss": 1.343, "loss/crossentropy": 2.3170522451400757, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.19068024307489395, "step": 9484 }, { "epoch": 0.14163163828309902, "grad_norm": 0.2177734375, "grad_norm_var": 0.00036023457845052085, "learning_rate": 0.0001, "loss": 1.2744, "loss/crossentropy": 2.586926221847534, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.16896453499794006, "step": 9485 }, { "epoch": 0.141646570453714, "grad_norm": 0.2490234375, "grad_norm_var": 0.00036296844482421877, "learning_rate": 0.0001, "loss": 1.4215, "loss/crossentropy": 2.5510811805725098, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.21445664763450623, "step": 9486 }, { "epoch": 0.14166150262432897, "grad_norm": 0.2275390625, "grad_norm_var": 0.0003749688466389974, "learning_rate": 0.0001, "loss": 1.2382, "loss/crossentropy": 2.443634271621704, "loss/fcd": 1.0703125, "loss/idx": 10.5, "loss/logits": 0.16786273568868637, "step": 9487 }, { "epoch": 0.14167643479494396, "grad_norm": 0.232421875, "grad_norm_var": 0.0003045399983723958, "learning_rate": 0.0001, "loss": 1.3488, "loss/crossentropy": 2.4148645401000977, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.17693816125392914, "step": 9488 }, { "epoch": 0.14169136696555895, "grad_norm": 0.228515625, "grad_norm_var": 0.00030879974365234376, "learning_rate": 0.0001, "loss": 1.3333, "loss/crossentropy": 2.476832866668701, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.18488913774490356, "step": 9489 }, { "epoch": 0.14170629913617394, "grad_norm": 0.2392578125, "grad_norm_var": 0.0003052393595377604, "learning_rate": 0.0001, "loss": 1.2592, "loss/crossentropy": 2.7207541465759277, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.16540253907442093, "step": 9490 }, { "epoch": 0.1417212313067889, "grad_norm": 0.23046875, "grad_norm_var": 0.00019199053446451823, "learning_rate": 0.0001, "loss": 1.2745, "loss/crossentropy": 2.75784170627594, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.18469326198101044, "step": 9491 }, { "epoch": 0.1417361634774039, "grad_norm": 0.255859375, "grad_norm_var": 0.00018054644266764323, "learning_rate": 0.0001, "loss": 1.4956, "loss/crossentropy": 2.540335536003113, "loss/fcd": 1.26953125, "loss/idx": 10.5, "loss/logits": 0.22602489590644836, "step": 9492 }, { "epoch": 0.14175109564801888, "grad_norm": 0.2412109375, "grad_norm_var": 0.00017842849095662435, "learning_rate": 0.0001, "loss": 1.326, "loss/crossentropy": 2.801883578300476, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.20492719858884811, "step": 9493 }, { "epoch": 0.14176602781863384, "grad_norm": 0.2109375, "grad_norm_var": 0.00017503102620442708, "learning_rate": 0.0001, "loss": 1.2603, "loss/crossentropy": 2.749627709388733, "loss/fcd": 1.0859375, "loss/idx": 10.5, "loss/logits": 0.17432521283626556, "step": 9494 }, { "epoch": 0.14178095998924883, "grad_norm": 0.19921875, "grad_norm_var": 0.00022988319396972657, "learning_rate": 0.0001, "loss": 1.2689, "loss/crossentropy": 2.4340391159057617, "loss/fcd": 1.095703125, "loss/idx": 10.5, "loss/logits": 0.1732359379529953, "step": 9495 }, { "epoch": 0.14179589215986382, "grad_norm": 0.240234375, "grad_norm_var": 0.0002330621083577474, "learning_rate": 0.0001, "loss": 1.3575, "loss/crossentropy": 2.8425474166870117, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.19736189395189285, "step": 9496 }, { "epoch": 0.1418108243304788, "grad_norm": 0.251953125, "grad_norm_var": 0.0002503077189127604, "learning_rate": 0.0001, "loss": 1.2966, "loss/crossentropy": 2.690892219543457, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18727389723062515, "step": 9497 }, { "epoch": 0.14182575650109377, "grad_norm": 0.2080078125, "grad_norm_var": 0.0002547860145568848, "learning_rate": 0.0001, "loss": 1.2481, "loss/crossentropy": 2.3670352697372437, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.16611211001873016, "step": 9498 }, { "epoch": 0.14184068867170876, "grad_norm": 0.251953125, "grad_norm_var": 0.0002802371978759766, "learning_rate": 0.0001, "loss": 1.3827, "loss/crossentropy": 2.4021599292755127, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.1991332322359085, "step": 9499 }, { "epoch": 0.14185562084232375, "grad_norm": 0.2431640625, "grad_norm_var": 0.00028673410415649416, "learning_rate": 0.0001, "loss": 1.3688, "loss/crossentropy": 2.641085982322693, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.18131285905838013, "step": 9500 }, { "epoch": 0.14187055301293872, "grad_norm": 0.244140625, "grad_norm_var": 0.00027675628662109374, "learning_rate": 0.0001, "loss": 1.3771, "loss/crossentropy": 2.3524203300476074, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.19354989379644394, "step": 9501 }, { "epoch": 0.1418854851835537, "grad_norm": 0.240234375, "grad_norm_var": 0.00026470422744750977, "learning_rate": 0.0001, "loss": 1.3561, "loss/crossentropy": 2.6170164346694946, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.19980476796627045, "step": 9502 }, { "epoch": 0.1419004173541687, "grad_norm": 0.259765625, "grad_norm_var": 0.00030155181884765624, "learning_rate": 0.0001, "loss": 1.4196, "loss/crossentropy": 2.641465902328491, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.20081645250320435, "step": 9503 }, { "epoch": 0.14191534952478366, "grad_norm": 0.2294921875, "grad_norm_var": 0.00030351877212524413, "learning_rate": 0.0001, "loss": 1.3471, "loss/crossentropy": 2.638666868209839, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.1986771672964096, "step": 9504 }, { "epoch": 0.14193028169539865, "grad_norm": 0.251953125, "grad_norm_var": 0.0003147721290588379, "learning_rate": 0.0001, "loss": 1.321, "loss/crossentropy": 2.648338198661804, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.18037045747041702, "step": 9505 }, { "epoch": 0.14194521386601364, "grad_norm": 0.2451171875, "grad_norm_var": 0.00031839609146118165, "learning_rate": 0.0001, "loss": 1.5676, "loss/crossentropy": 2.333516240119934, "loss/fcd": 1.3125, "loss/idx": 10.5, "loss/logits": 0.25508134812116623, "step": 9506 }, { "epoch": 0.14196014603662863, "grad_norm": 0.2275390625, "grad_norm_var": 0.00032176971435546873, "learning_rate": 0.0001, "loss": 1.2806, "loss/crossentropy": 2.75629723072052, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.18682356178760529, "step": 9507 }, { "epoch": 0.1419750782072436, "grad_norm": 0.255859375, "grad_norm_var": 0.00032176971435546873, "learning_rate": 0.0001, "loss": 1.242, "loss/crossentropy": 2.4802573919296265, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.16777631640434265, "step": 9508 }, { "epoch": 0.14199001037785858, "grad_norm": 0.2392578125, "grad_norm_var": 0.00032105445861816404, "learning_rate": 0.0001, "loss": 1.3815, "loss/crossentropy": 2.578150749206543, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.19006043672561646, "step": 9509 }, { "epoch": 0.14200494254847357, "grad_norm": 0.20703125, "grad_norm_var": 0.00033580462137858074, "learning_rate": 0.0001, "loss": 1.1742, "loss/crossentropy": 2.5055129528045654, "loss/fcd": 1.01953125, "loss/idx": 10.5, "loss/logits": 0.15469244867563248, "step": 9510 }, { "epoch": 0.14201987471908853, "grad_norm": 0.2275390625, "grad_norm_var": 0.00024257898330688478, "learning_rate": 0.0001, "loss": 1.3332, "loss/crossentropy": 2.669649839401245, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.18084163963794708, "step": 9511 }, { "epoch": 0.14203480688970352, "grad_norm": 0.2412109375, "grad_norm_var": 0.00024280548095703124, "learning_rate": 0.0001, "loss": 1.2518, "loss/crossentropy": 2.6492021083831787, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.1736922487616539, "step": 9512 }, { "epoch": 0.1420497390603185, "grad_norm": 0.3671875, "grad_norm_var": 0.0012715498606363932, "learning_rate": 0.0001, "loss": 1.764, "loss/crossentropy": 2.6834166049957275, "loss/fcd": 1.41796875, "loss/idx": 10.5, "loss/logits": 0.3460075259208679, "step": 9513 }, { "epoch": 0.1420646712309335, "grad_norm": 0.2099609375, "grad_norm_var": 0.0012618382771809897, "learning_rate": 0.0001, "loss": 1.3261, "loss/crossentropy": 2.6825989484786987, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.18545284867286682, "step": 9514 }, { "epoch": 0.14207960340154846, "grad_norm": 0.306640625, "grad_norm_var": 0.0014897028605143229, "learning_rate": 0.0001, "loss": 1.4816, "loss/crossentropy": 3.0303122997283936, "loss/fcd": 1.26953125, "loss/idx": 10.5, "loss/logits": 0.21208463609218597, "step": 9515 }, { "epoch": 0.14209453557216345, "grad_norm": 0.244140625, "grad_norm_var": 0.0014889041582743328, "learning_rate": 0.0001, "loss": 1.4303, "loss/crossentropy": 2.600090980529785, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.21541644632816315, "step": 9516 }, { "epoch": 0.14210946774277844, "grad_norm": 0.33203125, "grad_norm_var": 0.0019051829973856609, "learning_rate": 0.0001, "loss": 1.7563, "loss/crossentropy": 2.4699429273605347, "loss/fcd": 1.515625, "loss/idx": 10.5, "loss/logits": 0.24063080549240112, "step": 9517 }, { "epoch": 0.1421243999133934, "grad_norm": 0.2177734375, "grad_norm_var": 0.0019818623860677082, "learning_rate": 0.0001, "loss": 1.3182, "loss/crossentropy": 2.5734331607818604, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.18930354714393616, "step": 9518 }, { "epoch": 0.1421393320840084, "grad_norm": 0.2470703125, "grad_norm_var": 0.001982017358144124, "learning_rate": 0.0001, "loss": 1.4328, "loss/crossentropy": 2.916799545288086, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.22184907644987106, "step": 9519 }, { "epoch": 0.14215426425462338, "grad_norm": 0.2412109375, "grad_norm_var": 0.0019536932309468587, "learning_rate": 0.0001, "loss": 1.3843, "loss/crossentropy": 2.561159372329712, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.19681213051080704, "step": 9520 }, { "epoch": 0.14216919642523834, "grad_norm": 0.259765625, "grad_norm_var": 0.001955537001291911, "learning_rate": 0.0001, "loss": 1.452, "loss/crossentropy": 2.4724671840667725, "loss/fcd": 1.23046875, "loss/idx": 10.5, "loss/logits": 0.22152268141508102, "step": 9521 }, { "epoch": 0.14218412859585333, "grad_norm": 0.2197265625, "grad_norm_var": 0.0020270307858784994, "learning_rate": 0.0001, "loss": 1.4184, "loss/crossentropy": 2.600496292114258, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.21523286402225494, "step": 9522 }, { "epoch": 0.14219906076646832, "grad_norm": 0.2490234375, "grad_norm_var": 0.001983670393625895, "learning_rate": 0.0001, "loss": 1.3513, "loss/crossentropy": 2.667130947113037, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.19118157029151917, "step": 9523 }, { "epoch": 0.1422139929370833, "grad_norm": 0.2431640625, "grad_norm_var": 0.0019907474517822264, "learning_rate": 0.0001, "loss": 1.2984, "loss/crossentropy": 2.467320442199707, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.16952424496412277, "step": 9524 }, { "epoch": 0.14222892510769827, "grad_norm": 0.234375, "grad_norm_var": 0.002001376946767171, "learning_rate": 0.0001, "loss": 1.3973, "loss/crossentropy": 2.476413369178772, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.2097841575741768, "step": 9525 }, { "epoch": 0.14224385727831326, "grad_norm": 0.2421875, "grad_norm_var": 0.0018631895383199057, "learning_rate": 0.0001, "loss": 1.189, "loss/crossentropy": 2.2993338108062744, "loss/fcd": 1.04296875, "loss/idx": 10.5, "loss/logits": 0.14602231979370117, "step": 9526 }, { "epoch": 0.14225878944892825, "grad_norm": 0.2216796875, "grad_norm_var": 0.0018869360287984212, "learning_rate": 0.0001, "loss": 1.33, "loss/crossentropy": 2.608041286468506, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.1971973031759262, "step": 9527 }, { "epoch": 0.14227372161954321, "grad_norm": 0.294921875, "grad_norm_var": 0.001969766616821289, "learning_rate": 0.0001, "loss": 1.3611, "loss/crossentropy": 2.8979045152664185, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.2009485885500908, "step": 9528 }, { "epoch": 0.1422886537901582, "grad_norm": 0.251953125, "grad_norm_var": 0.0011248270670572917, "learning_rate": 0.0001, "loss": 1.4225, "loss/crossentropy": 2.5761327743530273, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.21549153327941895, "step": 9529 }, { "epoch": 0.1423035859607732, "grad_norm": 0.2294921875, "grad_norm_var": 0.001041857401529948, "learning_rate": 0.0001, "loss": 1.3542, "loss/crossentropy": 2.574631929397583, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.20185262709856033, "step": 9530 }, { "epoch": 0.14231851813138818, "grad_norm": 0.2353515625, "grad_norm_var": 0.0008419950803120931, "learning_rate": 0.0001, "loss": 1.3162, "loss/crossentropy": 2.589313268661499, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.17560040205717087, "step": 9531 }, { "epoch": 0.14233345030200314, "grad_norm": 0.259765625, "grad_norm_var": 0.000849751631418864, "learning_rate": 0.0001, "loss": 1.2228, "loss/crossentropy": 2.6551365852355957, "loss/fcd": 1.0546875, "loss/idx": 10.5, "loss/logits": 0.16815199702978134, "step": 9532 }, { "epoch": 0.14234838247261813, "grad_norm": 0.22265625, "grad_norm_var": 0.0003824512163798014, "learning_rate": 0.0001, "loss": 1.337, "loss/crossentropy": 2.443766713142395, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.18859852105379105, "step": 9533 }, { "epoch": 0.14236331464323312, "grad_norm": 0.2353515625, "grad_norm_var": 0.00034525791803995766, "learning_rate": 0.0001, "loss": 1.3208, "loss/crossentropy": 2.723590612411499, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.1957627460360527, "step": 9534 }, { "epoch": 0.14237824681384809, "grad_norm": 0.240234375, "grad_norm_var": 0.0003444512685139974, "learning_rate": 0.0001, "loss": 1.3722, "loss/crossentropy": 2.65397572517395, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.1925511509180069, "step": 9535 }, { "epoch": 0.14239317898446308, "grad_norm": 0.2373046875, "grad_norm_var": 0.00034610430399576825, "learning_rate": 0.0001, "loss": 1.3926, "loss/crossentropy": 2.634247899055481, "loss/fcd": 1.1796875, "loss/idx": 10.5, "loss/logits": 0.2129097655415535, "step": 9536 }, { "epoch": 0.14240811115507807, "grad_norm": 0.265625, "grad_norm_var": 0.00036188761393229164, "learning_rate": 0.0001, "loss": 1.4165, "loss/crossentropy": 2.5312000513076782, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.2250586450099945, "step": 9537 }, { "epoch": 0.14242304332569303, "grad_norm": 0.2109375, "grad_norm_var": 0.0003936092058817546, "learning_rate": 0.0001, "loss": 1.2983, "loss/crossentropy": 2.3568174839019775, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.18501794338226318, "step": 9538 }, { "epoch": 0.14243797549630802, "grad_norm": 0.28125, "grad_norm_var": 0.0004881540934244792, "learning_rate": 0.0001, "loss": 1.5368, "loss/crossentropy": 2.5599586963653564, "loss/fcd": 1.3125, "loss/idx": 10.5, "loss/logits": 0.2242748737335205, "step": 9539 }, { "epoch": 0.142452907666923, "grad_norm": 0.2294921875, "grad_norm_var": 0.0005016167958577474, "learning_rate": 0.0001, "loss": 1.2992, "loss/crossentropy": 2.5141289234161377, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.17806809395551682, "step": 9540 }, { "epoch": 0.142467839837538, "grad_norm": 0.21875, "grad_norm_var": 0.0005354404449462891, "learning_rate": 0.0001, "loss": 1.2592, "loss/crossentropy": 2.619695782661438, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.1772092804312706, "step": 9541 }, { "epoch": 0.14248277200815296, "grad_norm": 0.20703125, "grad_norm_var": 0.000613260269165039, "learning_rate": 0.0001, "loss": 1.2361, "loss/crossentropy": 2.5969960689544678, "loss/fcd": 1.0625, "loss/idx": 10.5, "loss/logits": 0.17364151030778885, "step": 9542 }, { "epoch": 0.14249770417876795, "grad_norm": 0.212890625, "grad_norm_var": 0.0006396889686584473, "learning_rate": 0.0001, "loss": 1.2515, "loss/crossentropy": 2.682297706604004, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.17723242193460464, "step": 9543 }, { "epoch": 0.14251263634938294, "grad_norm": 0.2353515625, "grad_norm_var": 0.00042177836100260415, "learning_rate": 0.0001, "loss": 1.3946, "loss/crossentropy": 2.5837109088897705, "loss/fcd": 1.19140625, "loss/idx": 10.5, "loss/logits": 0.20315717905759811, "step": 9544 }, { "epoch": 0.1425275685199979, "grad_norm": 0.224609375, "grad_norm_var": 0.00040976206461588543, "learning_rate": 0.0001, "loss": 1.352, "loss/crossentropy": 2.711501717567444, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.2075061947107315, "step": 9545 }, { "epoch": 0.1425425006906129, "grad_norm": 0.2236328125, "grad_norm_var": 0.0004155317942301432, "learning_rate": 0.0001, "loss": 1.3323, "loss/crossentropy": 2.5791587829589844, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.19170262664556503, "step": 9546 }, { "epoch": 0.14255743286122788, "grad_norm": 0.19921875, "grad_norm_var": 0.0004894852638244629, "learning_rate": 0.0001, "loss": 1.2315, "loss/crossentropy": 2.6207245588302612, "loss/fcd": 1.06640625, "loss/idx": 10.5, "loss/logits": 0.16511517763137817, "step": 9547 }, { "epoch": 0.14257236503184284, "grad_norm": 0.2451171875, "grad_norm_var": 0.00044770240783691405, "learning_rate": 0.0001, "loss": 1.3715, "loss/crossentropy": 2.5013657808303833, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.2113156095147133, "step": 9548 }, { "epoch": 0.14258729720245783, "grad_norm": 0.236328125, "grad_norm_var": 0.00044492085774739586, "learning_rate": 0.0001, "loss": 1.3773, "loss/crossentropy": 2.535917639732361, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.20538964122533798, "step": 9549 }, { "epoch": 0.14260222937307282, "grad_norm": 0.2197265625, "grad_norm_var": 0.0004520416259765625, "learning_rate": 0.0001, "loss": 1.3401, "loss/crossentropy": 2.5852701663970947, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.19167238473892212, "step": 9550 }, { "epoch": 0.1426171615436878, "grad_norm": 0.2119140625, "grad_norm_var": 0.00046529372533162433, "learning_rate": 0.0001, "loss": 1.2922, "loss/crossentropy": 2.2867552042007446, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.16718215495347977, "step": 9551 }, { "epoch": 0.14263209371430277, "grad_norm": 0.291015625, "grad_norm_var": 0.0007072289784749349, "learning_rate": 0.0001, "loss": 1.5064, "loss/crossentropy": 2.789198398590088, "loss/fcd": 1.2734375, "loss/idx": 10.5, "loss/logits": 0.2329530119895935, "step": 9552 }, { "epoch": 0.14264702588491776, "grad_norm": 0.2265625, "grad_norm_var": 0.0006277561187744141, "learning_rate": 0.0001, "loss": 1.309, "loss/crossentropy": 2.635051965713501, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.17616962641477585, "step": 9553 }, { "epoch": 0.14266195805553275, "grad_norm": 0.2392578125, "grad_norm_var": 0.0006073594093322753, "learning_rate": 0.0001, "loss": 1.2076, "loss/crossentropy": 2.7618653774261475, "loss/fcd": 1.046875, "loss/idx": 10.5, "loss/logits": 0.16076769679784775, "step": 9554 }, { "epoch": 0.1426768902261477, "grad_norm": 0.2412109375, "grad_norm_var": 0.0004413445790608724, "learning_rate": 0.0001, "loss": 1.4505, "loss/crossentropy": 2.477773666381836, "loss/fcd": 1.234375, "loss/idx": 10.5, "loss/logits": 0.21610725671052933, "step": 9555 }, { "epoch": 0.1426918223967627, "grad_norm": 0.232421875, "grad_norm_var": 0.0004421194394429525, "learning_rate": 0.0001, "loss": 1.4077, "loss/crossentropy": 2.345028281211853, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.21242009848356247, "step": 9556 }, { "epoch": 0.1427067545673777, "grad_norm": 0.2451171875, "grad_norm_var": 0.0004493077596028646, "learning_rate": 0.0001, "loss": 1.4273, "loss/crossentropy": 2.5956499576568604, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.21241265535354614, "step": 9557 }, { "epoch": 0.14272168673799268, "grad_norm": 0.251953125, "grad_norm_var": 0.00043358802795410154, "learning_rate": 0.0001, "loss": 1.3426, "loss/crossentropy": 2.4021657705307007, "loss/fcd": 1.162109375, "loss/idx": 10.5, "loss/logits": 0.18045827746391296, "step": 9558 }, { "epoch": 0.14273661890860764, "grad_norm": 0.2236328125, "grad_norm_var": 0.0004112521807352702, "learning_rate": 0.0001, "loss": 1.283, "loss/crossentropy": 2.6764330863952637, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.18921378254890442, "step": 9559 }, { "epoch": 0.14275155107922263, "grad_norm": 0.26953125, "grad_norm_var": 0.0004895528157552084, "learning_rate": 0.0001, "loss": 1.5768, "loss/crossentropy": 2.455868363380432, "loss/fcd": 1.34375, "loss/idx": 10.5, "loss/logits": 0.23305535316467285, "step": 9560 }, { "epoch": 0.14276648324983762, "grad_norm": 0.42578125, "grad_norm_var": 0.0027046044667561847, "learning_rate": 0.0001, "loss": 1.4861, "loss/crossentropy": 3.166740298271179, "loss/fcd": 1.265625, "loss/idx": 10.5, "loss/logits": 0.22050121426582336, "step": 9561 }, { "epoch": 0.14278141542045258, "grad_norm": 0.33984375, "grad_norm_var": 0.0031571348508199057, "learning_rate": 0.0001, "loss": 1.5151, "loss/crossentropy": 2.524213194847107, "loss/fcd": 1.30859375, "loss/idx": 10.5, "loss/logits": 0.20649836957454681, "step": 9562 }, { "epoch": 0.14279634759106757, "grad_norm": 0.271484375, "grad_norm_var": 0.002934833367665609, "learning_rate": 0.0001, "loss": 1.4226, "loss/crossentropy": 2.490659713745117, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.19995036721229553, "step": 9563 }, { "epoch": 0.14281127976168256, "grad_norm": 0.220703125, "grad_norm_var": 0.00302275021870931, "learning_rate": 0.0001, "loss": 1.2507, "loss/crossentropy": 2.6950485706329346, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.17254573106765747, "step": 9564 }, { "epoch": 0.14282621193229753, "grad_norm": 0.2578125, "grad_norm_var": 0.0029862085978190106, "learning_rate": 0.0001, "loss": 1.3229, "loss/crossentropy": 2.6649078130722046, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.1783491149544716, "step": 9565 }, { "epoch": 0.14284114410291252, "grad_norm": 0.24609375, "grad_norm_var": 0.002886323134104411, "learning_rate": 0.0001, "loss": 1.4061, "loss/crossentropy": 2.9004961252212524, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.21075671166181564, "step": 9566 }, { "epoch": 0.1428560762735275, "grad_norm": 0.216796875, "grad_norm_var": 0.0028551101684570314, "learning_rate": 0.0001, "loss": 1.3301, "loss/crossentropy": 2.564993143081665, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.1855943650007248, "step": 9567 }, { "epoch": 0.1428710084441425, "grad_norm": 0.275390625, "grad_norm_var": 0.0028108596801757813, "learning_rate": 0.0001, "loss": 1.5165, "loss/crossentropy": 2.521596670150757, "loss/fcd": 1.26953125, "loss/idx": 10.5, "loss/logits": 0.24693815410137177, "step": 9568 }, { "epoch": 0.14288594061475746, "grad_norm": 0.2216796875, "grad_norm_var": 0.0028350790341695148, "learning_rate": 0.0001, "loss": 1.2773, "loss/crossentropy": 2.6312389373779297, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.18354643881320953, "step": 9569 }, { "epoch": 0.14290087278537245, "grad_norm": 0.2353515625, "grad_norm_var": 0.002847445011138916, "learning_rate": 0.0001, "loss": 1.3016, "loss/crossentropy": 2.537177801132202, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.18053628504276276, "step": 9570 }, { "epoch": 0.14291580495598744, "grad_norm": 0.2265625, "grad_norm_var": 0.002899360656738281, "learning_rate": 0.0001, "loss": 1.4214, "loss/crossentropy": 2.513461709022522, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.21051091700792313, "step": 9571 }, { "epoch": 0.1429307371266024, "grad_norm": 0.263671875, "grad_norm_var": 0.0028454462687174478, "learning_rate": 0.0001, "loss": 1.3197, "loss/crossentropy": 2.769260883331299, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1907544657588005, "step": 9572 }, { "epoch": 0.1429456692972174, "grad_norm": 0.205078125, "grad_norm_var": 0.003035573164621989, "learning_rate": 0.0001, "loss": 1.2548, "loss/crossentropy": 2.5212053060531616, "loss/fcd": 1.08203125, "loss/idx": 10.5, "loss/logits": 0.17277374863624573, "step": 9573 }, { "epoch": 0.14296060146783238, "grad_norm": 0.24609375, "grad_norm_var": 0.0030435840288798015, "learning_rate": 0.0001, "loss": 1.4218, "loss/crossentropy": 2.419508218765259, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.20308740437030792, "step": 9574 }, { "epoch": 0.14297553363844737, "grad_norm": 0.2158203125, "grad_norm_var": 0.0030843377113342284, "learning_rate": 0.0001, "loss": 1.2912, "loss/crossentropy": 2.875875473022461, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.18183889985084534, "step": 9575 }, { "epoch": 0.14299046580906233, "grad_norm": 0.2197265625, "grad_norm_var": 0.003166818618774414, "learning_rate": 0.0001, "loss": 1.3361, "loss/crossentropy": 2.3297070264816284, "loss/fcd": 1.15625, "loss/idx": 10.5, "loss/logits": 0.17986807227134705, "step": 9576 }, { "epoch": 0.14300539797967732, "grad_norm": 0.208984375, "grad_norm_var": 0.0011819839477539063, "learning_rate": 0.0001, "loss": 1.3573, "loss/crossentropy": 2.379167675971985, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.19324533641338348, "step": 9577 }, { "epoch": 0.1430203301502923, "grad_norm": 0.2734375, "grad_norm_var": 0.0005907694498697917, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.481409430503845, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.20624887943267822, "step": 9578 }, { "epoch": 0.14303526232090727, "grad_norm": 0.240234375, "grad_norm_var": 0.0005114237467447917, "learning_rate": 0.0001, "loss": 1.3548, "loss/crossentropy": 2.531321167945862, "loss/fcd": 1.171875, "loss/idx": 10.5, "loss/logits": 0.18288728594779968, "step": 9579 }, { "epoch": 0.14305019449152226, "grad_norm": 0.236328125, "grad_norm_var": 0.000495147705078125, "learning_rate": 0.0001, "loss": 1.4844, "loss/crossentropy": 2.536004662513733, "loss/fcd": 1.25390625, "loss/idx": 10.5, "loss/logits": 0.23048307746648788, "step": 9580 }, { "epoch": 0.14306512666213725, "grad_norm": 0.220703125, "grad_norm_var": 0.0004773298899332682, "learning_rate": 0.0001, "loss": 1.364, "loss/crossentropy": 2.5771456956863403, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.19995328038930893, "step": 9581 }, { "epoch": 0.1430800588327522, "grad_norm": 0.2294921875, "grad_norm_var": 0.0004688858985900879, "learning_rate": 0.0001, "loss": 1.4114, "loss/crossentropy": 2.403106689453125, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.20438498258590698, "step": 9582 }, { "epoch": 0.1430949910033672, "grad_norm": 0.228515625, "grad_norm_var": 0.0004514336585998535, "learning_rate": 0.0001, "loss": 1.3334, "loss/crossentropy": 2.628024697303772, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.19280283898115158, "step": 9583 }, { "epoch": 0.1431099231739822, "grad_norm": 0.224609375, "grad_norm_var": 0.00033365488052368165, "learning_rate": 0.0001, "loss": 1.3836, "loss/crossentropy": 2.658261775970459, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.2078547328710556, "step": 9584 }, { "epoch": 0.14312485534459718, "grad_norm": 0.3359375, "grad_norm_var": 0.0010073184967041016, "learning_rate": 0.0001, "loss": 1.6714, "loss/crossentropy": 2.5025991201400757, "loss/fcd": 1.4296875, "loss/idx": 10.5, "loss/logits": 0.24172402918338776, "step": 9585 }, { "epoch": 0.14313978751521214, "grad_norm": 0.306640625, "grad_norm_var": 0.0012982646624247233, "learning_rate": 0.0001, "loss": 1.6733, "loss/crossentropy": 2.466047525405884, "loss/fcd": 1.3828125, "loss/idx": 10.5, "loss/logits": 0.29052910953760147, "step": 9586 }, { "epoch": 0.14315471968582713, "grad_norm": 0.2333984375, "grad_norm_var": 0.0012865543365478515, "learning_rate": 0.0001, "loss": 1.2824, "loss/crossentropy": 2.789056181907654, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.176891528069973, "step": 9587 }, { "epoch": 0.14316965185644212, "grad_norm": 0.263671875, "grad_norm_var": 0.0012865543365478515, "learning_rate": 0.0001, "loss": 1.4606, "loss/crossentropy": 2.4861525297164917, "loss/fcd": 1.26953125, "loss/idx": 10.5, "loss/logits": 0.1910889372229576, "step": 9588 }, { "epoch": 0.14318458402705708, "grad_norm": 0.212890625, "grad_norm_var": 0.0012508233388264973, "learning_rate": 0.0001, "loss": 1.2886, "loss/crossentropy": 2.4027822017669678, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.17533846199512482, "step": 9589 }, { "epoch": 0.14319951619767207, "grad_norm": 0.2294921875, "grad_norm_var": 0.0012623747189839682, "learning_rate": 0.0001, "loss": 1.3385, "loss/crossentropy": 2.282052755355835, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.17836138606071472, "step": 9590 }, { "epoch": 0.14321444836828706, "grad_norm": 0.283203125, "grad_norm_var": 0.0013065179189046224, "learning_rate": 0.0001, "loss": 1.3892, "loss/crossentropy": 2.3483868837356567, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.1782495453953743, "step": 9591 }, { "epoch": 0.14322938053890205, "grad_norm": 0.251953125, "grad_norm_var": 0.0012555082639058432, "learning_rate": 0.0001, "loss": 1.4168, "loss/crossentropy": 2.328999400138855, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.20583898574113846, "step": 9592 }, { "epoch": 0.143244312709517, "grad_norm": 0.265625, "grad_norm_var": 0.0011559446652730306, "learning_rate": 0.0001, "loss": 1.5154, "loss/crossentropy": 2.6347965002059937, "loss/fcd": 1.3046875, "loss/idx": 10.5, "loss/logits": 0.21066586673259735, "step": 9593 }, { "epoch": 0.143259244880132, "grad_norm": 0.251953125, "grad_norm_var": 0.0011241237322489421, "learning_rate": 0.0001, "loss": 1.3734, "loss/crossentropy": 2.35828959941864, "loss/fcd": 1.18359375, "loss/idx": 10.5, "loss/logits": 0.189833864569664, "step": 9594 }, { "epoch": 0.143274177050747, "grad_norm": 0.26171875, "grad_norm_var": 0.0011223753293355305, "learning_rate": 0.0001, "loss": 1.3919, "loss/crossentropy": 2.6525524854660034, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.19653792679309845, "step": 9595 }, { "epoch": 0.14328910922136195, "grad_norm": 0.244140625, "grad_norm_var": 0.001109596093495687, "learning_rate": 0.0001, "loss": 1.3746, "loss/crossentropy": 2.598668932914734, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.1988023966550827, "step": 9596 }, { "epoch": 0.14330404139197694, "grad_norm": 0.25390625, "grad_norm_var": 0.0010366400082906088, "learning_rate": 0.0001, "loss": 1.3845, "loss/crossentropy": 2.898369789123535, "loss/fcd": 1.1875, "loss/idx": 10.5, "loss/logits": 0.19697092473506927, "step": 9597 }, { "epoch": 0.14331897356259193, "grad_norm": 0.396484375, "grad_norm_var": 0.00221556027730306, "learning_rate": 0.0001, "loss": 1.4616, "loss/crossentropy": 2.5250481367111206, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.2193853035569191, "step": 9598 }, { "epoch": 0.1433339057332069, "grad_norm": 0.27734375, "grad_norm_var": 0.0021253585815429687, "learning_rate": 0.0001, "loss": 1.4083, "loss/crossentropy": 2.7119396924972534, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.19737648218870163, "step": 9599 }, { "epoch": 0.14334883790382189, "grad_norm": 0.2373046875, "grad_norm_var": 0.002061458428700765, "learning_rate": 0.0001, "loss": 1.28, "loss/crossentropy": 2.610126495361328, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.1862017661333084, "step": 9600 }, { "epoch": 0.14336377007443687, "grad_norm": 0.314453125, "grad_norm_var": 0.0018988569577534993, "learning_rate": 0.0001, "loss": 1.4488, "loss/crossentropy": 2.653404474258423, "loss/fcd": 1.21875, "loss/idx": 10.5, "loss/logits": 0.2300441861152649, "step": 9601 }, { "epoch": 0.14337870224505186, "grad_norm": 0.236328125, "grad_norm_var": 0.0018433531125386555, "learning_rate": 0.0001, "loss": 1.2004, "loss/crossentropy": 2.7937612533569336, "loss/fcd": 1.046875, "loss/idx": 10.5, "loss/logits": 0.15349411964416504, "step": 9602 }, { "epoch": 0.14339363441566683, "grad_norm": 0.2734375, "grad_norm_var": 0.0017835617065429688, "learning_rate": 0.0001, "loss": 1.408, "loss/crossentropy": 2.7663121223449707, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.2126934453845024, "step": 9603 }, { "epoch": 0.14340856658628182, "grad_norm": 0.2890625, "grad_norm_var": 0.0018164157867431641, "learning_rate": 0.0001, "loss": 1.4956, "loss/crossentropy": 2.530999541282654, "loss/fcd": 1.26953125, "loss/idx": 10.5, "loss/logits": 0.22604025900363922, "step": 9604 }, { "epoch": 0.1434234987568968, "grad_norm": 0.224609375, "grad_norm_var": 0.0017397403717041016, "learning_rate": 0.0001, "loss": 1.2619, "loss/crossentropy": 2.6640273332595825, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.16420253366231918, "step": 9605 }, { "epoch": 0.14343843092751177, "grad_norm": 0.2373046875, "grad_norm_var": 0.0017032464345296224, "learning_rate": 0.0001, "loss": 1.356, "loss/crossentropy": 2.520464301109314, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.19195334613323212, "step": 9606 }, { "epoch": 0.14345336309812676, "grad_norm": 0.224609375, "grad_norm_var": 0.001804335912068685, "learning_rate": 0.0001, "loss": 1.4115, "loss/crossentropy": 2.5270144939422607, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.19662123173475266, "step": 9607 }, { "epoch": 0.14346829526874175, "grad_norm": 0.2314453125, "grad_norm_var": 0.0018663366635640463, "learning_rate": 0.0001, "loss": 1.2959, "loss/crossentropy": 2.627149224281311, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.17874958366155624, "step": 9608 }, { "epoch": 0.1434832274393567, "grad_norm": 0.248046875, "grad_norm_var": 0.0018812139828999837, "learning_rate": 0.0001, "loss": 1.4094, "loss/crossentropy": 2.8046164512634277, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.21408799290657043, "step": 9609 }, { "epoch": 0.1434981596099717, "grad_norm": 0.228515625, "grad_norm_var": 0.0019489248593648274, "learning_rate": 0.0001, "loss": 1.319, "loss/crossentropy": 2.4848955869674683, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.19395452737808228, "step": 9610 }, { "epoch": 0.1435130917805867, "grad_norm": 0.2109375, "grad_norm_var": 0.0021063764890034992, "learning_rate": 0.0001, "loss": 1.3298, "loss/crossentropy": 2.638764977455139, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.20869697630405426, "step": 9611 }, { "epoch": 0.14352802395120168, "grad_norm": 0.255859375, "grad_norm_var": 0.0020933111508687338, "learning_rate": 0.0001, "loss": 1.5233, "loss/crossentropy": 2.4336291551589966, "loss/fcd": 1.26953125, "loss/idx": 10.5, "loss/logits": 0.2537868916988373, "step": 9612 }, { "epoch": 0.14354295612181664, "grad_norm": 0.236328125, "grad_norm_var": 0.0021239240964253743, "learning_rate": 0.0001, "loss": 1.4496, "loss/crossentropy": 2.601696014404297, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.22304361313581467, "step": 9613 }, { "epoch": 0.14355788829243163, "grad_norm": 0.2392578125, "grad_norm_var": 0.0007580439249674479, "learning_rate": 0.0001, "loss": 1.3316, "loss/crossentropy": 2.5769598484039307, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.19491096585988998, "step": 9614 }, { "epoch": 0.14357282046304662, "grad_norm": 0.2373046875, "grad_norm_var": 0.000700533390045166, "learning_rate": 0.0001, "loss": 1.2856, "loss/crossentropy": 2.7341305017471313, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.18008841574192047, "step": 9615 }, { "epoch": 0.14358775263366158, "grad_norm": 0.2333984375, "grad_norm_var": 0.0007056514422098796, "learning_rate": 0.0001, "loss": 1.4256, "loss/crossentropy": 2.242018461227417, "loss/fcd": 1.20703125, "loss/idx": 10.5, "loss/logits": 0.21858787536621094, "step": 9616 }, { "epoch": 0.14360268480427657, "grad_norm": 0.251953125, "grad_norm_var": 0.00037148396174112953, "learning_rate": 0.0001, "loss": 1.5511, "loss/crossentropy": 2.4339762926101685, "loss/fcd": 1.3125, "loss/idx": 10.5, "loss/logits": 0.23862942308187485, "step": 9617 }, { "epoch": 0.14361761697489156, "grad_norm": 0.2275390625, "grad_norm_var": 0.0003819624582926432, "learning_rate": 0.0001, "loss": 1.2494, "loss/crossentropy": 2.725418448448181, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.17518261075019836, "step": 9618 }, { "epoch": 0.14363254914550655, "grad_norm": 0.21484375, "grad_norm_var": 0.0003400007883707682, "learning_rate": 0.0001, "loss": 1.3332, "loss/crossentropy": 2.62307608127594, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.1886458471417427, "step": 9619 }, { "epoch": 0.1436474813161215, "grad_norm": 0.2177734375, "grad_norm_var": 0.0001621842384338379, "learning_rate": 0.0001, "loss": 1.3238, "loss/crossentropy": 2.5482760667800903, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.19486015290021896, "step": 9620 }, { "epoch": 0.1436624134867365, "grad_norm": 0.32421875, "grad_norm_var": 0.0006777405738830567, "learning_rate": 0.0001, "loss": 1.2998, "loss/crossentropy": 2.7143940925598145, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.18261627852916718, "step": 9621 }, { "epoch": 0.1436773456573515, "grad_norm": 0.2109375, "grad_norm_var": 0.0007261276245117187, "learning_rate": 0.0001, "loss": 1.2785, "loss/crossentropy": 2.46293842792511, "loss/fcd": 1.09375, "loss/idx": 10.5, "loss/logits": 0.1847839504480362, "step": 9622 }, { "epoch": 0.14369227782796645, "grad_norm": 0.224609375, "grad_norm_var": 0.0007261276245117187, "learning_rate": 0.0001, "loss": 1.3773, "loss/crossentropy": 2.984370231628418, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20152338594198227, "step": 9623 }, { "epoch": 0.14370720999858144, "grad_norm": 0.259765625, "grad_norm_var": 0.0007550517717997233, "learning_rate": 0.0001, "loss": 1.4708, "loss/crossentropy": 2.7266710996627808, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.20911084115505219, "step": 9624 }, { "epoch": 0.14372214216919643, "grad_norm": 0.26171875, "grad_norm_var": 0.0007835348447163899, "learning_rate": 0.0001, "loss": 1.3376, "loss/crossentropy": 2.90245258808136, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.1774844080209732, "step": 9625 }, { "epoch": 0.1437370743398114, "grad_norm": 0.2265625, "grad_norm_var": 0.0007866819699605306, "learning_rate": 0.0001, "loss": 1.3243, "loss/crossentropy": 2.611708164215088, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1954285278916359, "step": 9626 }, { "epoch": 0.14375200651042638, "grad_norm": 0.28515625, "grad_norm_var": 0.0008476853370666503, "learning_rate": 0.0001, "loss": 1.5074, "loss/crossentropy": 2.557468056678772, "loss/fcd": 1.28515625, "loss/idx": 10.5, "loss/logits": 0.22225894033908844, "step": 9627 }, { "epoch": 0.14376693868104137, "grad_norm": 0.271484375, "grad_norm_var": 0.0008872310320536295, "learning_rate": 0.0001, "loss": 1.426, "loss/crossentropy": 2.5033414363861084, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.19943827390670776, "step": 9628 }, { "epoch": 0.14378187085165636, "grad_norm": 0.220703125, "grad_norm_var": 0.0009209275245666504, "learning_rate": 0.0001, "loss": 1.2665, "loss/crossentropy": 2.448018431663513, "loss/fcd": 1.08984375, "loss/idx": 10.5, "loss/logits": 0.17669199407100677, "step": 9629 }, { "epoch": 0.14379680302227132, "grad_norm": 0.2490234375, "grad_norm_var": 0.0009204506874084473, "learning_rate": 0.0001, "loss": 1.5542, "loss/crossentropy": 2.542046308517456, "loss/fcd": 1.3046875, "loss/idx": 10.5, "loss/logits": 0.24952909350395203, "step": 9630 }, { "epoch": 0.14381173519288631, "grad_norm": 0.205078125, "grad_norm_var": 0.001017618179321289, "learning_rate": 0.0001, "loss": 1.3019, "loss/crossentropy": 2.614695191383362, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.18469354510307312, "step": 9631 }, { "epoch": 0.1438266673635013, "grad_norm": 0.248046875, "grad_norm_var": 0.0010126709938049316, "learning_rate": 0.0001, "loss": 1.3214, "loss/crossentropy": 2.586373209953308, "loss/fcd": 1.13671875, "loss/idx": 10.5, "loss/logits": 0.18463242799043655, "step": 9632 }, { "epoch": 0.14384159953411627, "grad_norm": 0.244140625, "grad_norm_var": 0.0010079026222229003, "learning_rate": 0.0001, "loss": 1.4045, "loss/crossentropy": 2.5436878204345703, "loss/fcd": 1.203125, "loss/idx": 10.5, "loss/logits": 0.20140201598405838, "step": 9633 }, { "epoch": 0.14385653170473126, "grad_norm": 0.279296875, "grad_norm_var": 0.0010670820871988933, "learning_rate": 0.0001, "loss": 1.2592, "loss/crossentropy": 2.59735643863678, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.1615482121706009, "step": 9634 }, { "epoch": 0.14387146387534624, "grad_norm": 0.2490234375, "grad_norm_var": 0.0009960134824117024, "learning_rate": 0.0001, "loss": 1.4303, "loss/crossentropy": 2.789368152618408, "loss/fcd": 1.22265625, "loss/idx": 10.5, "loss/logits": 0.20766711235046387, "step": 9635 }, { "epoch": 0.14388639604596123, "grad_norm": 0.2470703125, "grad_norm_var": 0.000929256280263265, "learning_rate": 0.0001, "loss": 1.4072, "loss/crossentropy": 2.5415146350860596, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.19621910899877548, "step": 9636 }, { "epoch": 0.1439013282165762, "grad_norm": 0.2490234375, "grad_norm_var": 0.0005428155263264973, "learning_rate": 0.0001, "loss": 1.3082, "loss/crossentropy": 2.5342458486557007, "loss/fcd": 1.125, "loss/idx": 10.5, "loss/logits": 0.18320249766111374, "step": 9637 }, { "epoch": 0.14391626038719119, "grad_norm": 0.28125, "grad_norm_var": 0.0005256493886311848, "learning_rate": 0.0001, "loss": 1.4427, "loss/crossentropy": 2.425041675567627, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.20052678138017654, "step": 9638 }, { "epoch": 0.14393119255780618, "grad_norm": 0.234375, "grad_norm_var": 0.0004983901977539063, "learning_rate": 0.0001, "loss": 1.455, "loss/crossentropy": 2.6971548795700073, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.2440975308418274, "step": 9639 }, { "epoch": 0.14394612472842114, "grad_norm": 0.2138671875, "grad_norm_var": 0.0005747755368550618, "learning_rate": 0.0001, "loss": 1.3204, "loss/crossentropy": 2.543662667274475, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.187566876411438, "step": 9640 }, { "epoch": 0.14396105689903613, "grad_norm": 0.2578125, "grad_norm_var": 0.0005685130755106609, "learning_rate": 0.0001, "loss": 1.3585, "loss/crossentropy": 2.6026976108551025, "loss/fcd": 1.1640625, "loss/idx": 10.5, "loss/logits": 0.1944158747792244, "step": 9641 }, { "epoch": 0.14397598906965112, "grad_norm": 0.296875, "grad_norm_var": 0.0006800929705301921, "learning_rate": 0.0001, "loss": 1.5307, "loss/crossentropy": 2.486689567565918, "loss/fcd": 1.31640625, "loss/idx": 10.5, "loss/logits": 0.21432828158140182, "step": 9642 }, { "epoch": 0.14399092124026608, "grad_norm": 0.2314453125, "grad_norm_var": 0.0006230513254801433, "learning_rate": 0.0001, "loss": 1.3759, "loss/crossentropy": 2.440722703933716, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.20009034126996994, "step": 9643 }, { "epoch": 0.14400585341088107, "grad_norm": 0.2216796875, "grad_norm_var": 0.0006264964739481608, "learning_rate": 0.0001, "loss": 1.2437, "loss/crossentropy": 2.712130308151245, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.16945812106132507, "step": 9644 }, { "epoch": 0.14402078558149606, "grad_norm": 0.2109375, "grad_norm_var": 0.0006648023923238118, "learning_rate": 0.0001, "loss": 1.2897, "loss/crossentropy": 2.5176109075546265, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.1920890361070633, "step": 9645 }, { "epoch": 0.14403571775211105, "grad_norm": 0.244140625, "grad_norm_var": 0.0006636301676432292, "learning_rate": 0.0001, "loss": 1.3242, "loss/crossentropy": 2.569314479827881, "loss/fcd": 1.14453125, "loss/idx": 10.5, "loss/logits": 0.17963898926973343, "step": 9646 }, { "epoch": 0.144050649922726, "grad_norm": 0.2734375, "grad_norm_var": 0.0005952040354410807, "learning_rate": 0.0001, "loss": 1.3199, "loss/crossentropy": 2.8077449798583984, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.17924223095178604, "step": 9647 }, { "epoch": 0.144065582093341, "grad_norm": 0.267578125, "grad_norm_var": 0.0006168206532796224, "learning_rate": 0.0001, "loss": 1.4175, "loss/crossentropy": 2.3933682441711426, "loss/fcd": 1.2109375, "loss/idx": 10.5, "loss/logits": 0.20660236477851868, "step": 9648 }, { "epoch": 0.144080514263956, "grad_norm": 0.271484375, "grad_norm_var": 0.0006417433420817057, "learning_rate": 0.0001, "loss": 1.3626, "loss/crossentropy": 2.8001677989959717, "loss/fcd": 1.16015625, "loss/idx": 10.5, "loss/logits": 0.20240898430347443, "step": 9649 }, { "epoch": 0.14409544643457095, "grad_norm": 0.24609375, "grad_norm_var": 0.0005890528361002604, "learning_rate": 0.0001, "loss": 1.3411, "loss/crossentropy": 2.839108109474182, "loss/fcd": 1.15234375, "loss/idx": 10.5, "loss/logits": 0.18877114355564117, "step": 9650 }, { "epoch": 0.14411037860518594, "grad_norm": 0.232421875, "grad_norm_var": 0.0006078998247782389, "learning_rate": 0.0001, "loss": 1.2886, "loss/crossentropy": 2.6594998836517334, "loss/fcd": 1.11328125, "loss/idx": 10.5, "loss/logits": 0.1752975732088089, "step": 9651 }, { "epoch": 0.14412531077580093, "grad_norm": 0.2373046875, "grad_norm_var": 0.0006160060564676921, "learning_rate": 0.0001, "loss": 1.4256, "loss/crossentropy": 2.4621118307113647, "loss/fcd": 1.21484375, "loss/idx": 10.5, "loss/logits": 0.21072688698768616, "step": 9652 }, { "epoch": 0.14414024294641592, "grad_norm": 0.234375, "grad_norm_var": 0.0006276289621988932, "learning_rate": 0.0001, "loss": 1.3098, "loss/crossentropy": 2.4106231927871704, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.17699838429689407, "step": 9653 }, { "epoch": 0.14415517511703088, "grad_norm": 0.26953125, "grad_norm_var": 0.0005829970041910808, "learning_rate": 0.0001, "loss": 1.3081, "loss/crossentropy": 2.846924304962158, "loss/fcd": 1.109375, "loss/idx": 10.5, "loss/logits": 0.19869375228881836, "step": 9654 }, { "epoch": 0.14417010728764587, "grad_norm": 0.267578125, "grad_norm_var": 0.0005983988444010417, "learning_rate": 0.0001, "loss": 1.4464, "loss/crossentropy": 2.452194571495056, "loss/fcd": 1.2421875, "loss/idx": 10.5, "loss/logits": 0.2041865587234497, "step": 9655 }, { "epoch": 0.14418503945826086, "grad_norm": 0.201171875, "grad_norm_var": 0.0006671547889709473, "learning_rate": 0.0001, "loss": 1.301, "loss/crossentropy": 2.5349377393722534, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.18385622650384903, "step": 9656 }, { "epoch": 0.14419997162887582, "grad_norm": 0.2392578125, "grad_norm_var": 0.00066375732421875, "learning_rate": 0.0001, "loss": 1.4365, "loss/crossentropy": 1.9326728582382202, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.1747988611459732, "step": 9657 }, { "epoch": 0.1442149037994908, "grad_norm": 0.208984375, "grad_norm_var": 0.0005571842193603516, "learning_rate": 0.0001, "loss": 1.445, "loss/crossentropy": 2.569476366043091, "loss/fcd": 1.2265625, "loss/idx": 10.5, "loss/logits": 0.21842770278453827, "step": 9658 }, { "epoch": 0.1442298359701058, "grad_norm": 0.2216796875, "grad_norm_var": 0.0005757013956705729, "learning_rate": 0.0001, "loss": 1.3941, "loss/crossentropy": 2.5217394828796387, "loss/fcd": 1.1953125, "loss/idx": 10.5, "loss/logits": 0.19882404804229736, "step": 9659 }, { "epoch": 0.14424476814072076, "grad_norm": 0.2353515625, "grad_norm_var": 0.0005531152089436849, "learning_rate": 0.0001, "loss": 1.2555, "loss/crossentropy": 2.5202648639678955, "loss/fcd": 1.09765625, "loss/idx": 10.5, "loss/logits": 0.15784182399511337, "step": 9660 }, { "epoch": 0.14425970031133575, "grad_norm": 0.2197265625, "grad_norm_var": 0.0005223234494527181, "learning_rate": 0.0001, "loss": 1.2886, "loss/crossentropy": 2.470387578010559, "loss/fcd": 1.1171875, "loss/idx": 10.5, "loss/logits": 0.17142455279827118, "step": 9661 }, { "epoch": 0.14427463248195074, "grad_norm": 0.2119140625, "grad_norm_var": 0.0005775292714436849, "learning_rate": 0.0001, "loss": 1.3382, "loss/crossentropy": 2.5689321756362915, "loss/fcd": 1.1484375, "loss/idx": 10.5, "loss/logits": 0.18980871886014938, "step": 9662 }, { "epoch": 0.14428956465256573, "grad_norm": 0.2490234375, "grad_norm_var": 0.0005055069923400879, "learning_rate": 0.0001, "loss": 1.2403, "loss/crossentropy": 2.75722599029541, "loss/fcd": 1.07421875, "loss/idx": 10.5, "loss/logits": 0.1660478413105011, "step": 9663 }, { "epoch": 0.1443044968231807, "grad_norm": 0.216796875, "grad_norm_var": 0.00046872695287068685, "learning_rate": 0.0001, "loss": 1.2981, "loss/crossentropy": 2.4731229543685913, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.1770351231098175, "step": 9664 }, { "epoch": 0.14431942899379568, "grad_norm": 0.208984375, "grad_norm_var": 0.00041023492813110354, "learning_rate": 0.0001, "loss": 1.2118, "loss/crossentropy": 2.6424769163131714, "loss/fcd": 1.046875, "loss/idx": 10.5, "loss/logits": 0.16496965289115906, "step": 9665 }, { "epoch": 0.14433436116441067, "grad_norm": 0.2353515625, "grad_norm_var": 0.00039620399475097655, "learning_rate": 0.0001, "loss": 1.2975, "loss/crossentropy": 2.4721102714538574, "loss/fcd": 1.1328125, "loss/idx": 10.5, "loss/logits": 0.1647222638130188, "step": 9666 }, { "epoch": 0.14434929333502564, "grad_norm": 0.28515625, "grad_norm_var": 0.0005828857421875, "learning_rate": 0.0001, "loss": 1.3568, "loss/crossentropy": 2.625073552131653, "loss/fcd": 1.17578125, "loss/idx": 10.5, "loss/logits": 0.18106835335493088, "step": 9667 }, { "epoch": 0.14436422550564063, "grad_norm": 0.2119140625, "grad_norm_var": 0.0006116072336832683, "learning_rate": 0.0001, "loss": 1.2723, "loss/crossentropy": 2.539343476295471, "loss/fcd": 1.10546875, "loss/idx": 10.5, "loss/logits": 0.1668248474597931, "step": 9668 }, { "epoch": 0.14437915767625562, "grad_norm": 0.25390625, "grad_norm_var": 0.0006408532460530599, "learning_rate": 0.0001, "loss": 1.4511, "loss/crossentropy": 2.2832337617874146, "loss/fcd": 1.26171875, "loss/idx": 10.5, "loss/logits": 0.18935878574848175, "step": 9669 }, { "epoch": 0.14439408984687058, "grad_norm": 0.212890625, "grad_norm_var": 0.0005694071451822917, "learning_rate": 0.0001, "loss": 1.3315, "loss/crossentropy": 2.5088655948638916, "loss/fcd": 1.12109375, "loss/idx": 10.5, "loss/logits": 0.2103649228811264, "step": 9670 }, { "epoch": 0.14440902201748557, "grad_norm": 0.2138671875, "grad_norm_var": 0.00048045714696248374, "learning_rate": 0.0001, "loss": 1.3106, "loss/crossentropy": 2.4905169010162354, "loss/fcd": 1.12890625, "loss/idx": 10.5, "loss/logits": 0.1816909909248352, "step": 9671 }, { "epoch": 0.14442395418810056, "grad_norm": 0.21875, "grad_norm_var": 0.00044011672337849935, "learning_rate": 0.0001, "loss": 1.2148, "loss/crossentropy": 2.695306181907654, "loss/fcd": 1.046875, "loss/idx": 10.5, "loss/logits": 0.16796646267175674, "step": 9672 }, { "epoch": 0.14443888635871555, "grad_norm": 0.240234375, "grad_norm_var": 0.0004416783650716146, "learning_rate": 0.0001, "loss": 1.328, "loss/crossentropy": 2.5622178316116333, "loss/fcd": 1.140625, "loss/idx": 10.5, "loss/logits": 0.18739333748817444, "step": 9673 }, { "epoch": 0.1444538185293305, "grad_norm": 0.2578125, "grad_norm_var": 0.0004683017730712891, "learning_rate": 0.0001, "loss": 1.3534, "loss/crossentropy": 2.490492105484009, "loss/fcd": 1.16796875, "loss/idx": 10.5, "loss/logits": 0.18546582758426666, "step": 9674 }, { "epoch": 0.1444687506999455, "grad_norm": 0.21484375, "grad_norm_var": 0.0004795670509338379, "learning_rate": 0.0001, "loss": 1.2574, "loss/crossentropy": 2.746322274208069, "loss/fcd": 1.078125, "loss/idx": 10.5, "loss/logits": 0.179249569773674, "step": 9675 }, { "epoch": 0.1444836828705605, "grad_norm": 0.349609375, "grad_norm_var": 0.0013708114624023438, "learning_rate": 0.0001, "loss": 1.4398, "loss/crossentropy": 2.4169344902038574, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.19373080134391785, "step": 9676 }, { "epoch": 0.14449861504117545, "grad_norm": 0.439453125, "grad_norm_var": 0.0038661599159240724, "learning_rate": 0.0001, "loss": 1.5749, "loss/crossentropy": 2.6741456985473633, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.23506059497594833, "step": 9677 }, { "epoch": 0.14451354721179044, "grad_norm": 0.466796875, "grad_norm_var": 0.006588602066040039, "learning_rate": 0.0001, "loss": 1.5009, "loss/crossentropy": 2.6554417610168457, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2118253856897354, "step": 9678 }, { "epoch": 0.14452847938240543, "grad_norm": 0.380859375, "grad_norm_var": 0.0073551774024963375, "learning_rate": 0.0001, "loss": 1.4518, "loss/crossentropy": 2.5073477029800415, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19395192712545395, "step": 9679 }, { "epoch": 0.14454341155302042, "grad_norm": 0.419921875, "grad_norm_var": 0.008345345656077066, "learning_rate": 0.0001, "loss": 1.5835, "loss/crossentropy": 2.5214834213256836, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.2241588532924652, "step": 9680 }, { "epoch": 0.14455834372363538, "grad_norm": 0.361328125, "grad_norm_var": 0.008187894026438396, "learning_rate": 0.0001, "loss": 1.4076, "loss/crossentropy": 2.5127495527267456, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.18107592314481735, "step": 9681 }, { "epoch": 0.14457327589425037, "grad_norm": 0.396484375, "grad_norm_var": 0.008471790949503582, "learning_rate": 0.0001, "loss": 1.5163, "loss/crossentropy": 2.774734616279602, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2272064983844757, "step": 9682 }, { "epoch": 0.14458820806486536, "grad_norm": 0.345703125, "grad_norm_var": 0.008518600463867187, "learning_rate": 0.0001, "loss": 1.428, "loss/crossentropy": 2.525889277458191, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19750560820102692, "step": 9683 }, { "epoch": 0.14460314023548032, "grad_norm": 0.341796875, "grad_norm_var": 0.007847940921783448, "learning_rate": 0.0001, "loss": 1.6331, "loss/crossentropy": 2.366658568382263, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.2658788412809372, "step": 9684 }, { "epoch": 0.1446180724060953, "grad_norm": 0.51953125, "grad_norm_var": 0.00992962121963501, "learning_rate": 0.0001, "loss": 1.8013, "loss/crossentropy": 2.2177120447158813, "loss/fcd": 1.5703125, "loss/idx": 11.0, "loss/logits": 0.23100589215755463, "step": 9685 }, { "epoch": 0.1446330045767103, "grad_norm": 0.3671875, "grad_norm_var": 0.00887988011042277, "learning_rate": 0.0001, "loss": 1.5381, "loss/crossentropy": 2.6360291242599487, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2255638837814331, "step": 9686 }, { "epoch": 0.14464793674732526, "grad_norm": 0.341796875, "grad_norm_var": 0.0076508680979410805, "learning_rate": 0.0001, "loss": 1.3897, "loss/crossentropy": 2.725042700767517, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18652701377868652, "step": 9687 }, { "epoch": 0.14466286891794025, "grad_norm": 0.3828125, "grad_norm_var": 0.0063771406809488935, "learning_rate": 0.0001, "loss": 1.4593, "loss/crossentropy": 2.6178070306777954, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.1976310908794403, "step": 9688 }, { "epoch": 0.14467780108855524, "grad_norm": 0.38671875, "grad_norm_var": 0.005298296610514323, "learning_rate": 0.0001, "loss": 1.5763, "loss/crossentropy": 2.404675841331482, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.23641996085643768, "step": 9689 }, { "epoch": 0.14469273325917023, "grad_norm": 0.353515625, "grad_norm_var": 0.004397185643513998, "learning_rate": 0.0001, "loss": 1.4075, "loss/crossentropy": 2.7484984397888184, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.1848173812031746, "step": 9690 }, { "epoch": 0.1447076654297852, "grad_norm": 0.337890625, "grad_norm_var": 0.0026458104451497397, "learning_rate": 0.0001, "loss": 1.4209, "loss/crossentropy": 2.5883688926696777, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19047021865844727, "step": 9691 }, { "epoch": 0.14472259760040018, "grad_norm": 0.31640625, "grad_norm_var": 0.0028800805409749347, "learning_rate": 0.0001, "loss": 1.4734, "loss/crossentropy": 2.6191368103027344, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.20780649036169052, "step": 9692 }, { "epoch": 0.14473752977101517, "grad_norm": 0.345703125, "grad_norm_var": 0.002747329076131185, "learning_rate": 0.0001, "loss": 1.552, "loss/crossentropy": 2.7708863019943237, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.21606529504060745, "step": 9693 }, { "epoch": 0.14475246194163013, "grad_norm": 0.310546875, "grad_norm_var": 0.0024446964263916014, "learning_rate": 0.0001, "loss": 1.4481, "loss/crossentropy": 2.5857187509536743, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19805237650871277, "step": 9694 }, { "epoch": 0.14476739411224512, "grad_norm": 0.361328125, "grad_norm_var": 0.00243833859761556, "learning_rate": 0.0001, "loss": 1.4888, "loss/crossentropy": 2.48714280128479, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.20754172652959824, "step": 9695 }, { "epoch": 0.1447823262828601, "grad_norm": 0.298828125, "grad_norm_var": 0.0025171756744384764, "learning_rate": 0.0001, "loss": 1.4184, "loss/crossentropy": 2.677434206008911, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.1957305371761322, "step": 9696 }, { "epoch": 0.1447972584534751, "grad_norm": 0.310546875, "grad_norm_var": 0.0026725610097249348, "learning_rate": 0.0001, "loss": 1.4743, "loss/crossentropy": 2.5453230142593384, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21259988844394684, "step": 9697 }, { "epoch": 0.14481219062409006, "grad_norm": 0.64453125, "grad_norm_var": 0.007813962300618489, "learning_rate": 0.0001, "loss": 1.6753, "loss/crossentropy": 2.6270681619644165, "loss/fcd": 1.4453125, "loss/idx": 11.0, "loss/logits": 0.23001042753458023, "step": 9698 }, { "epoch": 0.14482712279470505, "grad_norm": 0.322265625, "grad_norm_var": 0.00793298085530599, "learning_rate": 0.0001, "loss": 1.6011, "loss/crossentropy": 2.3306902647018433, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.23387249559164047, "step": 9699 }, { "epoch": 0.14484205496532004, "grad_norm": 0.310546875, "grad_norm_var": 0.008117103576660156, "learning_rate": 0.0001, "loss": 1.4942, "loss/crossentropy": 2.4086174964904785, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.2168828397989273, "step": 9700 }, { "epoch": 0.144856987135935, "grad_norm": 0.291015625, "grad_norm_var": 0.006806039810180664, "learning_rate": 0.0001, "loss": 1.5162, "loss/crossentropy": 2.5781325101852417, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.23103384673595428, "step": 9701 }, { "epoch": 0.14487191930655, "grad_norm": 0.345703125, "grad_norm_var": 0.006800270080566407, "learning_rate": 0.0001, "loss": 1.5445, "loss/crossentropy": 2.4659076929092407, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.19292669743299484, "step": 9702 }, { "epoch": 0.14488685147716499, "grad_norm": 0.3203125, "grad_norm_var": 0.00686338742574056, "learning_rate": 0.0001, "loss": 1.5164, "loss/crossentropy": 2.5400941371917725, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.21952545642852783, "step": 9703 }, { "epoch": 0.14490178364777995, "grad_norm": 0.30078125, "grad_norm_var": 0.0069515069325764975, "learning_rate": 0.0001, "loss": 1.4965, "loss/crossentropy": 2.7543174028396606, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.21134328097105026, "step": 9704 }, { "epoch": 0.14491671581839494, "grad_norm": 0.3046875, "grad_norm_var": 0.006940825780232748, "learning_rate": 0.0001, "loss": 1.404, "loss/crossentropy": 2.5467281341552734, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.18919190019369125, "step": 9705 }, { "epoch": 0.14493164798900993, "grad_norm": 0.33984375, "grad_norm_var": 0.006931813557942709, "learning_rate": 0.0001, "loss": 1.5352, "loss/crossentropy": 2.6903231143951416, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.21101857721805573, "step": 9706 }, { "epoch": 0.14494658015962492, "grad_norm": 0.310546875, "grad_norm_var": 0.006991004943847657, "learning_rate": 0.0001, "loss": 1.5041, "loss/crossentropy": 2.6587226390838623, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21503892540931702, "step": 9707 }, { "epoch": 0.14496151233023988, "grad_norm": 0.333984375, "grad_norm_var": 0.006955957412719727, "learning_rate": 0.0001, "loss": 1.5455, "loss/crossentropy": 2.5425397157669067, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.22132470458745956, "step": 9708 }, { "epoch": 0.14497644450085487, "grad_norm": 0.349609375, "grad_norm_var": 0.00695951779683431, "learning_rate": 0.0001, "loss": 1.5618, "loss/crossentropy": 2.5868500471115112, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.22977043688297272, "step": 9709 }, { "epoch": 0.14499137667146986, "grad_norm": 0.30859375, "grad_norm_var": 0.006967671712239583, "learning_rate": 0.0001, "loss": 1.4545, "loss/crossentropy": 2.6332318782806396, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21230117976665497, "step": 9710 }, { "epoch": 0.14500630884208482, "grad_norm": 0.337890625, "grad_norm_var": 0.006937917073567708, "learning_rate": 0.0001, "loss": 1.4907, "loss/crossentropy": 2.5521944761276245, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.20940809696912766, "step": 9711 }, { "epoch": 0.1450212410126998, "grad_norm": 0.37890625, "grad_norm_var": 0.006905984878540039, "learning_rate": 0.0001, "loss": 1.7148, "loss/crossentropy": 2.438349962234497, "loss/fcd": 1.421875, "loss/idx": 11.0, "loss/logits": 0.29295025765895844, "step": 9712 }, { "epoch": 0.1450361731833148, "grad_norm": 0.314453125, "grad_norm_var": 0.00688932736714681, "learning_rate": 0.0001, "loss": 1.3989, "loss/crossentropy": 2.3172056674957275, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18792209774255753, "step": 9713 }, { "epoch": 0.1450511053539298, "grad_norm": 0.365234375, "grad_norm_var": 0.0005956013997395833, "learning_rate": 0.0001, "loss": 1.5745, "loss/crossentropy": 2.382473111152649, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.21906301379203796, "step": 9714 }, { "epoch": 0.14506603752454475, "grad_norm": 0.322265625, "grad_norm_var": 0.0005956013997395833, "learning_rate": 0.0001, "loss": 1.4559, "loss/crossentropy": 2.6780471801757812, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19809741526842117, "step": 9715 }, { "epoch": 0.14508096969515974, "grad_norm": 0.294921875, "grad_norm_var": 0.00064544677734375, "learning_rate": 0.0001, "loss": 1.3997, "loss/crossentropy": 2.6586958169937134, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19657938927412033, "step": 9716 }, { "epoch": 0.14509590186577473, "grad_norm": 0.296875, "grad_norm_var": 0.0006201267242431641, "learning_rate": 0.0001, "loss": 1.4718, "loss/crossentropy": 2.643516421318054, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21400786191225052, "step": 9717 }, { "epoch": 0.1451108340363897, "grad_norm": 0.3515625, "grad_norm_var": 0.0006372451782226563, "learning_rate": 0.0001, "loss": 1.5709, "loss/crossentropy": 2.4701870679855347, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.22323815524578094, "step": 9718 }, { "epoch": 0.14512576620700468, "grad_norm": 0.326171875, "grad_norm_var": 0.0006342411041259765, "learning_rate": 0.0001, "loss": 1.5354, "loss/crossentropy": 2.75900137424469, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2228904366493225, "step": 9719 }, { "epoch": 0.14514069837761967, "grad_norm": 0.34765625, "grad_norm_var": 0.0006060123443603516, "learning_rate": 0.0001, "loss": 1.5131, "loss/crossentropy": 2.523534893989563, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.22011751681566238, "step": 9720 }, { "epoch": 0.14515563054823463, "grad_norm": 0.333984375, "grad_norm_var": 0.00055999755859375, "learning_rate": 0.0001, "loss": 1.5004, "loss/crossentropy": 2.593127131462097, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2152590975165367, "step": 9721 }, { "epoch": 0.14517056271884962, "grad_norm": 0.298828125, "grad_norm_var": 0.000622415542602539, "learning_rate": 0.0001, "loss": 1.555, "loss/crossentropy": 2.777364730834961, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.23467165976762772, "step": 9722 }, { "epoch": 0.1451854948894646, "grad_norm": 0.609375, "grad_norm_var": 0.005449676513671875, "learning_rate": 0.0001, "loss": 1.6137, "loss/crossentropy": 2.5918357372283936, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.2582193985581398, "step": 9723 }, { "epoch": 0.1452004270600796, "grad_norm": 0.349609375, "grad_norm_var": 0.005435434977213541, "learning_rate": 0.0001, "loss": 1.418, "loss/crossentropy": 2.514988660812378, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19535168260335922, "step": 9724 }, { "epoch": 0.14521535923069456, "grad_norm": 0.34375, "grad_norm_var": 0.005437199274698893, "learning_rate": 0.0001, "loss": 1.4679, "loss/crossentropy": 2.4790440797805786, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20616630464792252, "step": 9725 }, { "epoch": 0.14523029140130955, "grad_norm": 0.333984375, "grad_norm_var": 0.005341529846191406, "learning_rate": 0.0001, "loss": 1.4571, "loss/crossentropy": 2.579779863357544, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19927139580249786, "step": 9726 }, { "epoch": 0.14524522357192454, "grad_norm": 0.337890625, "grad_norm_var": 0.005341529846191406, "learning_rate": 0.0001, "loss": 1.5491, "loss/crossentropy": 2.7049397230148315, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.23272642493247986, "step": 9727 }, { "epoch": 0.1452601557425395, "grad_norm": 0.36328125, "grad_norm_var": 0.005297279357910157, "learning_rate": 0.0001, "loss": 1.4057, "loss/crossentropy": 2.5979764461517334, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.17909057438373566, "step": 9728 }, { "epoch": 0.1452750879131545, "grad_norm": 0.34375, "grad_norm_var": 0.005214548110961914, "learning_rate": 0.0001, "loss": 1.5076, "loss/crossentropy": 2.5496336221694946, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21856272220611572, "step": 9729 }, { "epoch": 0.14529002008376948, "grad_norm": 0.365234375, "grad_norm_var": 0.005214548110961914, "learning_rate": 0.0001, "loss": 1.4713, "loss/crossentropy": 2.55440616607666, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.20566771179437637, "step": 9730 }, { "epoch": 0.14530495225438445, "grad_norm": 0.28515625, "grad_norm_var": 0.005443763732910156, "learning_rate": 0.0001, "loss": 1.2492, "loss/crossentropy": 2.477937936782837, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.14767039567232132, "step": 9731 }, { "epoch": 0.14531988442499943, "grad_norm": 0.283203125, "grad_norm_var": 0.005536651611328125, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.474961996078491, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1847212091088295, "step": 9732 }, { "epoch": 0.14533481659561442, "grad_norm": 0.46875, "grad_norm_var": 0.006208038330078125, "learning_rate": 0.0001, "loss": 1.5404, "loss/crossentropy": 2.7014377117156982, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.227894127368927, "step": 9733 }, { "epoch": 0.14534974876622941, "grad_norm": 0.384765625, "grad_norm_var": 0.006244516372680664, "learning_rate": 0.0001, "loss": 1.6041, "loss/crossentropy": 2.211951494216919, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.23295682668685913, "step": 9734 }, { "epoch": 0.14536468093684438, "grad_norm": 0.345703125, "grad_norm_var": 0.006177759170532227, "learning_rate": 0.0001, "loss": 1.6424, "loss/crossentropy": 2.3903249502182007, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.2596157118678093, "step": 9735 }, { "epoch": 0.14537961310745937, "grad_norm": 0.3125, "grad_norm_var": 0.006323099136352539, "learning_rate": 0.0001, "loss": 1.3785, "loss/crossentropy": 2.6581748723983765, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18710396438837051, "step": 9736 }, { "epoch": 0.14539454527807436, "grad_norm": 0.349609375, "grad_norm_var": 0.006284189224243164, "learning_rate": 0.0001, "loss": 1.6364, "loss/crossentropy": 2.600591778755188, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.25362491607666016, "step": 9737 }, { "epoch": 0.14540947744868932, "grad_norm": 0.3671875, "grad_norm_var": 0.006009928385416667, "learning_rate": 0.0001, "loss": 1.4293, "loss/crossentropy": 2.6068592071533203, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.1871315985918045, "step": 9738 }, { "epoch": 0.1454244096193043, "grad_norm": 0.3203125, "grad_norm_var": 0.001822662353515625, "learning_rate": 0.0001, "loss": 1.4048, "loss/crossentropy": 2.5315182209014893, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.18994104862213135, "step": 9739 }, { "epoch": 0.1454393417899193, "grad_norm": 0.353515625, "grad_norm_var": 0.0018248875935872396, "learning_rate": 0.0001, "loss": 1.6195, "loss/crossentropy": 2.36371648311615, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.23667418211698532, "step": 9740 }, { "epoch": 0.14545427396053429, "grad_norm": 0.380859375, "grad_norm_var": 0.0018928368886311849, "learning_rate": 0.0001, "loss": 1.5795, "loss/crossentropy": 2.38541579246521, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.22406043857336044, "step": 9741 }, { "epoch": 0.14546920613114925, "grad_norm": 0.30078125, "grad_norm_var": 0.0020314534505208332, "learning_rate": 0.0001, "loss": 1.4376, "loss/crossentropy": 2.7253013849258423, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2032717987895012, "step": 9742 }, { "epoch": 0.14548413830176424, "grad_norm": 0.31640625, "grad_norm_var": 0.0020882765452067058, "learning_rate": 0.0001, "loss": 1.4843, "loss/crossentropy": 2.4843355417251587, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.2069850116968155, "step": 9743 }, { "epoch": 0.14549907047237923, "grad_norm": 0.30078125, "grad_norm_var": 0.002191019058227539, "learning_rate": 0.0001, "loss": 1.4347, "loss/crossentropy": 2.6679478883743286, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2003471702337265, "step": 9744 }, { "epoch": 0.1455140026429942, "grad_norm": 0.34375, "grad_norm_var": 0.002191019058227539, "learning_rate": 0.0001, "loss": 1.5212, "loss/crossentropy": 2.706373453140259, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.22430507838726044, "step": 9745 }, { "epoch": 0.14552893481360918, "grad_norm": 0.322265625, "grad_norm_var": 0.002175633112589518, "learning_rate": 0.0001, "loss": 1.5137, "loss/crossentropy": 2.620542883872986, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.2129211574792862, "step": 9746 }, { "epoch": 0.14554386698422417, "grad_norm": 0.3046875, "grad_norm_var": 0.002057377497355143, "learning_rate": 0.0001, "loss": 1.4604, "loss/crossentropy": 2.3920156955718994, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20258252322673798, "step": 9747 }, { "epoch": 0.14555879915483913, "grad_norm": 0.34375, "grad_norm_var": 0.00182037353515625, "learning_rate": 0.0001, "loss": 1.459, "loss/crossentropy": 2.5672428607940674, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.2050715833902359, "step": 9748 }, { "epoch": 0.14557373132545412, "grad_norm": 0.390625, "grad_norm_var": 0.0009099324544270833, "learning_rate": 0.0001, "loss": 1.5223, "loss/crossentropy": 2.542309880256653, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.20588800311088562, "step": 9749 }, { "epoch": 0.1455886634960691, "grad_norm": 0.3125, "grad_norm_var": 0.0008034865061442057, "learning_rate": 0.0001, "loss": 1.3946, "loss/crossentropy": 2.7966610193252563, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19924001395702362, "step": 9750 }, { "epoch": 0.1456035956666841, "grad_norm": 0.333984375, "grad_norm_var": 0.0007958571116129557, "learning_rate": 0.0001, "loss": 1.6069, "loss/crossentropy": 2.5513018369674683, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.2241293266415596, "step": 9751 }, { "epoch": 0.14561852783729906, "grad_norm": 0.296875, "grad_norm_var": 0.0008571465810139974, "learning_rate": 0.0001, "loss": 1.3826, "loss/crossentropy": 2.5841680765151978, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1833566129207611, "step": 9752 }, { "epoch": 0.14563346000791405, "grad_norm": 0.32421875, "grad_norm_var": 0.0008433024088541667, "learning_rate": 0.0001, "loss": 1.3951, "loss/crossentropy": 2.746025323867798, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19197487086057663, "step": 9753 }, { "epoch": 0.14564839217852904, "grad_norm": 0.33203125, "grad_norm_var": 0.0007557551066080729, "learning_rate": 0.0001, "loss": 1.5094, "loss/crossentropy": 2.435897707939148, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22814105451107025, "step": 9754 }, { "epoch": 0.145663324349144, "grad_norm": 0.32421875, "grad_norm_var": 0.0007517496744791667, "learning_rate": 0.0001, "loss": 1.64, "loss/crossentropy": 2.51952588558197, "loss/fcd": 1.375, "loss/idx": 11.0, "loss/logits": 0.2649959400296211, "step": 9755 }, { "epoch": 0.145678256519759, "grad_norm": 0.291015625, "grad_norm_var": 0.0008005777994791667, "learning_rate": 0.0001, "loss": 1.3743, "loss/crossentropy": 2.4755663871765137, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.17898137122392654, "step": 9756 }, { "epoch": 0.14569318869037398, "grad_norm": 0.318359375, "grad_norm_var": 0.0005889892578125, "learning_rate": 0.0001, "loss": 1.4016, "loss/crossentropy": 2.588122248649597, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.18287934362888336, "step": 9757 }, { "epoch": 0.14570812086098897, "grad_norm": 0.5, "grad_norm_var": 0.002498817443847656, "learning_rate": 0.0001, "loss": 1.7385, "loss/crossentropy": 2.4882001876831055, "loss/fcd": 1.48828125, "loss/idx": 11.0, "loss/logits": 0.2501769959926605, "step": 9758 }, { "epoch": 0.14572305303160393, "grad_norm": 0.35546875, "grad_norm_var": 0.002498817443847656, "learning_rate": 0.0001, "loss": 1.5579, "loss/crossentropy": 2.6858569383621216, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.20636172592639923, "step": 9759 }, { "epoch": 0.14573798520221892, "grad_norm": 0.33984375, "grad_norm_var": 0.0024047215779622396, "learning_rate": 0.0001, "loss": 1.4772, "loss/crossentropy": 2.4426170587539673, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2076948955655098, "step": 9760 }, { "epoch": 0.1457529173728339, "grad_norm": 0.306640625, "grad_norm_var": 0.002470254898071289, "learning_rate": 0.0001, "loss": 1.3745, "loss/crossentropy": 2.6242235898971558, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18697883188724518, "step": 9761 }, { "epoch": 0.14576784954344887, "grad_norm": 0.287109375, "grad_norm_var": 0.0026178836822509767, "learning_rate": 0.0001, "loss": 1.4655, "loss/crossentropy": 2.46648108959198, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20773327350616455, "step": 9762 }, { "epoch": 0.14578278171406386, "grad_norm": 0.294921875, "grad_norm_var": 0.002663421630859375, "learning_rate": 0.0001, "loss": 1.533, "loss/crossentropy": 2.6015610694885254, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.22053544223308563, "step": 9763 }, { "epoch": 0.14579771388467885, "grad_norm": 0.298828125, "grad_norm_var": 0.0027339776357014974, "learning_rate": 0.0001, "loss": 1.322, "loss/crossentropy": 2.360832452774048, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.16970086097717285, "step": 9764 }, { "epoch": 0.14581264605529382, "grad_norm": 0.32421875, "grad_norm_var": 0.0024875481923421223, "learning_rate": 0.0001, "loss": 1.4499, "loss/crossentropy": 2.749859094619751, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.1959717720746994, "step": 9765 }, { "epoch": 0.1458275782259088, "grad_norm": 0.310546875, "grad_norm_var": 0.0024916966756184894, "learning_rate": 0.0001, "loss": 1.419, "loss/crossentropy": 2.719935178756714, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.1846430003643036, "step": 9766 }, { "epoch": 0.1458425103965238, "grad_norm": 0.3046875, "grad_norm_var": 0.002519591649373372, "learning_rate": 0.0001, "loss": 1.3213, "loss/crossentropy": 2.4985448122024536, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.16899675875902176, "step": 9767 }, { "epoch": 0.14585744256713878, "grad_norm": 0.33203125, "grad_norm_var": 0.002462371190388997, "learning_rate": 0.0001, "loss": 1.5115, "loss/crossentropy": 2.5781091451644897, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.20295238494873047, "step": 9768 }, { "epoch": 0.14587237473775375, "grad_norm": 0.287109375, "grad_norm_var": 0.002565956115722656, "learning_rate": 0.0001, "loss": 1.4084, "loss/crossentropy": 2.446441650390625, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.18964842706918716, "step": 9769 }, { "epoch": 0.14588730690836874, "grad_norm": 0.283203125, "grad_norm_var": 0.0026720523834228515, "learning_rate": 0.0001, "loss": 1.4929, "loss/crossentropy": 2.6982632875442505, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2233433648943901, "step": 9770 }, { "epoch": 0.14590223907898373, "grad_norm": 0.314453125, "grad_norm_var": 0.002675628662109375, "learning_rate": 0.0001, "loss": 1.4737, "loss/crossentropy": 2.6775548458099365, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.22365055978298187, "step": 9771 }, { "epoch": 0.1459171712495987, "grad_norm": 0.3359375, "grad_norm_var": 0.002617502212524414, "learning_rate": 0.0001, "loss": 1.5098, "loss/crossentropy": 2.4564335346221924, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.21289193630218506, "step": 9772 }, { "epoch": 0.14593210342021368, "grad_norm": 0.33203125, "grad_norm_var": 0.0026178359985351562, "learning_rate": 0.0001, "loss": 1.4186, "loss/crossentropy": 3.0044384002685547, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19202201813459396, "step": 9773 }, { "epoch": 0.14594703559082867, "grad_norm": 0.3203125, "grad_norm_var": 0.00045363108317057293, "learning_rate": 0.0001, "loss": 1.468, "loss/crossentropy": 2.7421627044677734, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20631658285856247, "step": 9774 }, { "epoch": 0.14596196776144363, "grad_norm": 0.3203125, "grad_norm_var": 0.0003374735514322917, "learning_rate": 0.0001, "loss": 1.5345, "loss/crossentropy": 2.5565340518951416, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.22594991326332092, "step": 9775 }, { "epoch": 0.14597689993205862, "grad_norm": 0.318359375, "grad_norm_var": 0.0002865950266520182, "learning_rate": 0.0001, "loss": 1.4283, "loss/crossentropy": 2.641120433807373, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.2055940181016922, "step": 9776 }, { "epoch": 0.1459918321026736, "grad_norm": 0.318359375, "grad_norm_var": 0.00028888384501139325, "learning_rate": 0.0001, "loss": 1.4666, "loss/crossentropy": 2.3397361040115356, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.19708409905433655, "step": 9777 }, { "epoch": 0.1460067642732886, "grad_norm": 0.306640625, "grad_norm_var": 0.0002494653065999349, "learning_rate": 0.0001, "loss": 1.4169, "loss/crossentropy": 2.732629895210266, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1903054267168045, "step": 9778 }, { "epoch": 0.14602169644390356, "grad_norm": 0.3828125, "grad_norm_var": 0.0005248387654622396, "learning_rate": 0.0001, "loss": 1.6767, "loss/crossentropy": 2.66701602935791, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.27822447568178177, "step": 9779 }, { "epoch": 0.14603662861451855, "grad_norm": 0.396484375, "grad_norm_var": 0.0008697509765625, "learning_rate": 0.0001, "loss": 1.5089, "loss/crossentropy": 2.5239278078079224, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.2120424285531044, "step": 9780 }, { "epoch": 0.14605156078513354, "grad_norm": 0.298828125, "grad_norm_var": 0.0009100437164306641, "learning_rate": 0.0001, "loss": 1.4804, "loss/crossentropy": 2.6983749866485596, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20693161338567734, "step": 9781 }, { "epoch": 0.1460664929557485, "grad_norm": 0.375, "grad_norm_var": 0.001065826416015625, "learning_rate": 0.0001, "loss": 1.7774, "loss/crossentropy": 2.7072341442108154, "loss/fcd": 1.46484375, "loss/idx": 11.0, "loss/logits": 0.3125927299261093, "step": 9782 }, { "epoch": 0.1460814251263635, "grad_norm": 0.3125, "grad_norm_var": 0.0010467529296875, "learning_rate": 0.0001, "loss": 1.4301, "loss/crossentropy": 2.699126362800598, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19959862530231476, "step": 9783 }, { "epoch": 0.14609635729697848, "grad_norm": 0.33203125, "grad_norm_var": 0.0010467529296875, "learning_rate": 0.0001, "loss": 1.5085, "loss/crossentropy": 2.5723847150802612, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21947412937879562, "step": 9784 }, { "epoch": 0.14611128946759347, "grad_norm": 0.33203125, "grad_norm_var": 0.0009330590565999349, "learning_rate": 0.0001, "loss": 1.4224, "loss/crossentropy": 2.8802120685577393, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19978175312280655, "step": 9785 }, { "epoch": 0.14612622163820843, "grad_norm": 0.283203125, "grad_norm_var": 0.0009330590565999349, "learning_rate": 0.0001, "loss": 1.3914, "loss/crossentropy": 2.6377800703048706, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19218068569898605, "step": 9786 }, { "epoch": 0.14614115380882342, "grad_norm": 0.34765625, "grad_norm_var": 0.0009333292643229166, "learning_rate": 0.0001, "loss": 1.398, "loss/crossentropy": 2.6882903575897217, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.17927618324756622, "step": 9787 }, { "epoch": 0.1461560859794384, "grad_norm": 0.310546875, "grad_norm_var": 0.000960397720336914, "learning_rate": 0.0001, "loss": 1.3934, "loss/crossentropy": 2.4785808324813843, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19030292332172394, "step": 9788 }, { "epoch": 0.14617101815005337, "grad_norm": 0.326171875, "grad_norm_var": 0.0009613037109375, "learning_rate": 0.0001, "loss": 1.4497, "loss/crossentropy": 2.597024917602539, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20755094289779663, "step": 9789 }, { "epoch": 0.14618595032066836, "grad_norm": 0.298828125, "grad_norm_var": 0.0010181268056233723, "learning_rate": 0.0001, "loss": 1.3545, "loss/crossentropy": 2.6081193685531616, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17479082942008972, "step": 9790 }, { "epoch": 0.14620088249128335, "grad_norm": 0.2734375, "grad_norm_var": 0.0012080987294514974, "learning_rate": 0.0001, "loss": 1.3629, "loss/crossentropy": 2.4896163940429688, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18325775116682053, "step": 9791 }, { "epoch": 0.1462158146618983, "grad_norm": 0.279296875, "grad_norm_var": 0.0013422489166259766, "learning_rate": 0.0001, "loss": 1.3542, "loss/crossentropy": 2.6550400257110596, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.19013546407222748, "step": 9792 }, { "epoch": 0.1462307468325133, "grad_norm": 0.36328125, "grad_norm_var": 0.0014383951822916667, "learning_rate": 0.0001, "loss": 1.6724, "loss/crossentropy": 2.4405115842819214, "loss/fcd": 1.44140625, "loss/idx": 11.0, "loss/logits": 0.23097429424524307, "step": 9793 }, { "epoch": 0.1462456790031283, "grad_norm": 0.33203125, "grad_norm_var": 0.0014125665028889975, "learning_rate": 0.0001, "loss": 1.4766, "loss/crossentropy": 2.5658401250839233, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20313285291194916, "step": 9794 }, { "epoch": 0.14626061117374328, "grad_norm": 0.345703125, "grad_norm_var": 0.0012262344360351562, "learning_rate": 0.0001, "loss": 1.4529, "loss/crossentropy": 2.888078808784485, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21074560284614563, "step": 9795 }, { "epoch": 0.14627554334435824, "grad_norm": 0.28125, "grad_norm_var": 0.0009645938873291016, "learning_rate": 0.0001, "loss": 1.338, "loss/crossentropy": 2.616087317466736, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18177592009305954, "step": 9796 }, { "epoch": 0.14629047551497323, "grad_norm": 0.28515625, "grad_norm_var": 0.00101165771484375, "learning_rate": 0.0001, "loss": 1.4298, "loss/crossentropy": 2.4589526653289795, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.199289970099926, "step": 9797 }, { "epoch": 0.14630540768558822, "grad_norm": 0.28125, "grad_norm_var": 0.00084075927734375, "learning_rate": 0.0001, "loss": 1.3248, "loss/crossentropy": 2.500062942504883, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17633502930402756, "step": 9798 }, { "epoch": 0.14632033985620319, "grad_norm": 0.310546875, "grad_norm_var": 0.0008407433827718099, "learning_rate": 0.0001, "loss": 1.359, "loss/crossentropy": 2.450695276260376, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.16367372125387192, "step": 9799 }, { "epoch": 0.14633527202681818, "grad_norm": 0.361328125, "grad_norm_var": 0.0009749730428059896, "learning_rate": 0.0001, "loss": 1.6293, "loss/crossentropy": 2.550660014152527, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.24260997772216797, "step": 9800 }, { "epoch": 0.14635020419743316, "grad_norm": 0.373046875, "grad_norm_var": 0.0011829217274983725, "learning_rate": 0.0001, "loss": 1.6226, "loss/crossentropy": 2.6224762201309204, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.2280440330505371, "step": 9801 }, { "epoch": 0.14636513636804815, "grad_norm": 0.31640625, "grad_norm_var": 0.0011075337727864583, "learning_rate": 0.0001, "loss": 1.2568, "loss/crossentropy": 2.434427499771118, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.1513405591249466, "step": 9802 }, { "epoch": 0.14638006853866312, "grad_norm": 0.310546875, "grad_norm_var": 0.0010462284088134765, "learning_rate": 0.0001, "loss": 1.4201, "loss/crossentropy": 2.567965269088745, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20131678134202957, "step": 9803 }, { "epoch": 0.1463950007092781, "grad_norm": 0.30078125, "grad_norm_var": 0.0010587056477864584, "learning_rate": 0.0001, "loss": 1.4557, "loss/crossentropy": 2.657173275947571, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21741922199726105, "step": 9804 }, { "epoch": 0.1464099328798931, "grad_norm": 0.388671875, "grad_norm_var": 0.0013964335123697917, "learning_rate": 0.0001, "loss": 1.5189, "loss/crossentropy": 2.4574692249298096, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2064191699028015, "step": 9805 }, { "epoch": 0.14642486505050806, "grad_norm": 0.357421875, "grad_norm_var": 0.001454607645670573, "learning_rate": 0.0001, "loss": 1.4088, "loss/crossentropy": 2.883238434791565, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1978415995836258, "step": 9806 }, { "epoch": 0.14643979722112305, "grad_norm": 0.3125, "grad_norm_var": 0.001294390360514323, "learning_rate": 0.0001, "loss": 1.3091, "loss/crossentropy": 2.317423462867737, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.14898931980133057, "step": 9807 }, { "epoch": 0.14645472939173804, "grad_norm": 0.349609375, "grad_norm_var": 0.001175371805826823, "learning_rate": 0.0001, "loss": 1.5238, "loss/crossentropy": 2.649922728538513, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.199552021920681, "step": 9808 }, { "epoch": 0.146469661562353, "grad_norm": 0.3046875, "grad_norm_var": 0.0011248270670572917, "learning_rate": 0.0001, "loss": 1.5796, "loss/crossentropy": 2.6024614572525024, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.2397986203432083, "step": 9809 }, { "epoch": 0.146484593732968, "grad_norm": 0.291015625, "grad_norm_var": 0.0011952559153238932, "learning_rate": 0.0001, "loss": 1.4318, "loss/crossentropy": 2.6910319328308105, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19740907847881317, "step": 9810 }, { "epoch": 0.14649952590358298, "grad_norm": 0.32421875, "grad_norm_var": 0.0011594136555989583, "learning_rate": 0.0001, "loss": 1.4685, "loss/crossentropy": 2.3572133779525757, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.1950543150305748, "step": 9811 }, { "epoch": 0.14651445807419797, "grad_norm": 0.291015625, "grad_norm_var": 0.0011126041412353516, "learning_rate": 0.0001, "loss": 1.4329, "loss/crossentropy": 2.54998242855072, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19847725331783295, "step": 9812 }, { "epoch": 0.14652939024481293, "grad_norm": 0.322265625, "grad_norm_var": 0.0010144551595052083, "learning_rate": 0.0001, "loss": 1.4594, "loss/crossentropy": 2.709232807159424, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20935997366905212, "step": 9813 }, { "epoch": 0.14654432241542792, "grad_norm": 0.302734375, "grad_norm_var": 0.0009188175201416016, "learning_rate": 0.0001, "loss": 1.3783, "loss/crossentropy": 2.689836859703064, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1947418600320816, "step": 9814 }, { "epoch": 0.1465592545860429, "grad_norm": 0.302734375, "grad_norm_var": 0.0009387811024983724, "learning_rate": 0.0001, "loss": 1.4633, "loss/crossentropy": 2.426468253135681, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20162509381771088, "step": 9815 }, { "epoch": 0.14657418675665787, "grad_norm": 0.296875, "grad_norm_var": 0.0008910497029622396, "learning_rate": 0.0001, "loss": 1.4256, "loss/crossentropy": 2.753135561943054, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20295970886945724, "step": 9816 }, { "epoch": 0.14658911892727286, "grad_norm": 0.318359375, "grad_norm_var": 0.0007023493448893229, "learning_rate": 0.0001, "loss": 1.4802, "loss/crossentropy": 2.7326544523239136, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21452892571687698, "step": 9817 }, { "epoch": 0.14660405109788785, "grad_norm": 0.384765625, "grad_norm_var": 0.000978835423787435, "learning_rate": 0.0001, "loss": 1.6212, "loss/crossentropy": 2.4962666034698486, "loss/fcd": 1.390625, "loss/idx": 11.0, "loss/logits": 0.23054344952106476, "step": 9818 }, { "epoch": 0.14661898326850284, "grad_norm": 0.34375, "grad_norm_var": 0.000995318094889323, "learning_rate": 0.0001, "loss": 1.5956, "loss/crossentropy": 2.6268054246902466, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.24017058312892914, "step": 9819 }, { "epoch": 0.1466339154391178, "grad_norm": 0.31640625, "grad_norm_var": 0.0009612401326497395, "learning_rate": 0.0001, "loss": 1.4542, "loss/crossentropy": 2.5443536043167114, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19640769064426422, "step": 9820 }, { "epoch": 0.1466488476097328, "grad_norm": 0.359375, "grad_norm_var": 0.0007678826649983723, "learning_rate": 0.0001, "loss": 1.5421, "loss/crossentropy": 2.4844372272491455, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.21784856170415878, "step": 9821 }, { "epoch": 0.14666377978034778, "grad_norm": 0.322265625, "grad_norm_var": 0.0006866296132405599, "learning_rate": 0.0001, "loss": 1.4726, "loss/crossentropy": 2.6109379529953003, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21480904519557953, "step": 9822 }, { "epoch": 0.14667871195096274, "grad_norm": 0.33203125, "grad_norm_var": 0.000687265396118164, "learning_rate": 0.0001, "loss": 1.4854, "loss/crossentropy": 2.545221447944641, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21977780759334564, "step": 9823 }, { "epoch": 0.14669364412157773, "grad_norm": 0.296875, "grad_norm_var": 0.00067138671875, "learning_rate": 0.0001, "loss": 1.3338, "loss/crossentropy": 2.5852326154708862, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.16586222499608994, "step": 9824 }, { "epoch": 0.14670857629219272, "grad_norm": 0.27734375, "grad_norm_var": 0.0007715225219726562, "learning_rate": 0.0001, "loss": 1.2939, "loss/crossentropy": 2.523116707801819, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1689198836684227, "step": 9825 }, { "epoch": 0.14672350846280768, "grad_norm": 0.330078125, "grad_norm_var": 0.0007282892862955729, "learning_rate": 0.0001, "loss": 1.5202, "loss/crossentropy": 2.5039104223251343, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.22328323125839233, "step": 9826 }, { "epoch": 0.14673844063342267, "grad_norm": 0.330078125, "grad_norm_var": 0.0007336775461832683, "learning_rate": 0.0001, "loss": 1.4747, "loss/crossentropy": 2.565016269683838, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21303047239780426, "step": 9827 }, { "epoch": 0.14675337280403766, "grad_norm": 0.29296875, "grad_norm_var": 0.0007262547810872396, "learning_rate": 0.0001, "loss": 1.455, "loss/crossentropy": 2.5492690801620483, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20497232675552368, "step": 9828 }, { "epoch": 0.14676830497465265, "grad_norm": 0.322265625, "grad_norm_var": 0.0007262547810872396, "learning_rate": 0.0001, "loss": 1.5328, "loss/crossentropy": 2.6061477661132812, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.25152838230133057, "step": 9829 }, { "epoch": 0.14678323714526761, "grad_norm": 0.3359375, "grad_norm_var": 0.000716257095336914, "learning_rate": 0.0001, "loss": 1.5791, "loss/crossentropy": 2.728128433227539, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.23146354407072067, "step": 9830 }, { "epoch": 0.1467981693158826, "grad_norm": 0.345703125, "grad_norm_var": 0.0007176558176676432, "learning_rate": 0.0001, "loss": 1.7912, "loss/crossentropy": 2.5033565759658813, "loss/fcd": 1.48828125, "loss/idx": 11.0, "loss/logits": 0.3029298707842827, "step": 9831 }, { "epoch": 0.1468131014864976, "grad_norm": 0.302734375, "grad_norm_var": 0.0006975809733072917, "learning_rate": 0.0001, "loss": 1.361, "loss/crossentropy": 2.6992733478546143, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1773584708571434, "step": 9832 }, { "epoch": 0.14682803365711256, "grad_norm": 0.345703125, "grad_norm_var": 0.0007176081339518229, "learning_rate": 0.0001, "loss": 1.5874, "loss/crossentropy": 2.5732502937316895, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.22415746003389359, "step": 9833 }, { "epoch": 0.14684296582772755, "grad_norm": 0.33203125, "grad_norm_var": 0.0004880110422770182, "learning_rate": 0.0001, "loss": 1.4056, "loss/crossentropy": 2.571555495262146, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.17905131727457047, "step": 9834 }, { "epoch": 0.14685789799834253, "grad_norm": 0.291015625, "grad_norm_var": 0.0005236307779947917, "learning_rate": 0.0001, "loss": 1.3146, "loss/crossentropy": 2.5834790468215942, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.16229310631752014, "step": 9835 }, { "epoch": 0.1468728301689575, "grad_norm": 0.265625, "grad_norm_var": 0.0007145563761393229, "learning_rate": 0.0001, "loss": 1.3754, "loss/crossentropy": 2.5040171146392822, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19573017954826355, "step": 9836 }, { "epoch": 0.1468877623395725, "grad_norm": 0.2890625, "grad_norm_var": 0.0006321589152018229, "learning_rate": 0.0001, "loss": 1.34, "loss/crossentropy": 2.7713723182678223, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17200472950935364, "step": 9837 }, { "epoch": 0.14690269451018748, "grad_norm": 0.369140625, "grad_norm_var": 0.000825945536295573, "learning_rate": 0.0001, "loss": 1.6226, "loss/crossentropy": 2.5112507343292236, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.23590531200170517, "step": 9838 }, { "epoch": 0.14691762668080247, "grad_norm": 0.330078125, "grad_norm_var": 0.0008220513661702474, "learning_rate": 0.0001, "loss": 1.5586, "loss/crossentropy": 2.5097261667251587, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.23051833361387253, "step": 9839 }, { "epoch": 0.14693255885141743, "grad_norm": 0.28515625, "grad_norm_var": 0.0008605798085530599, "learning_rate": 0.0001, "loss": 1.3123, "loss/crossentropy": 2.439372181892395, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.17562198638916016, "step": 9840 }, { "epoch": 0.14694749102203242, "grad_norm": 0.51171875, "grad_norm_var": 0.0031074364980061848, "learning_rate": 0.0001, "loss": 1.6724, "loss/crossentropy": 2.318181872367859, "loss/fcd": 1.45703125, "loss/idx": 11.0, "loss/logits": 0.21532107889652252, "step": 9841 }, { "epoch": 0.1469624231926474, "grad_norm": 0.31640625, "grad_norm_var": 0.003118896484375, "learning_rate": 0.0001, "loss": 1.3862, "loss/crossentropy": 2.6446175575256348, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1947791650891304, "step": 9842 }, { "epoch": 0.14697735536326237, "grad_norm": 0.294921875, "grad_norm_var": 0.003191566467285156, "learning_rate": 0.0001, "loss": 1.5323, "loss/crossentropy": 2.5974775552749634, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.23148221522569656, "step": 9843 }, { "epoch": 0.14699228753387736, "grad_norm": 0.373046875, "grad_norm_var": 0.003230015436808268, "learning_rate": 0.0001, "loss": 1.5377, "loss/crossentropy": 2.6134462356567383, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.23694493621587753, "step": 9844 }, { "epoch": 0.14700721970449235, "grad_norm": 0.37890625, "grad_norm_var": 0.003357696533203125, "learning_rate": 0.0001, "loss": 1.4151, "loss/crossentropy": 2.701570987701416, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20029717683792114, "step": 9845 }, { "epoch": 0.14702215187510734, "grad_norm": 0.28515625, "grad_norm_var": 0.0035155614217122397, "learning_rate": 0.0001, "loss": 1.3932, "loss/crossentropy": 2.4762675762176514, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19398167729377747, "step": 9846 }, { "epoch": 0.1470370840457223, "grad_norm": 0.4375, "grad_norm_var": 0.0042065779368082685, "learning_rate": 0.0001, "loss": 1.7083, "loss/crossentropy": 2.7838653326034546, "loss/fcd": 1.44140625, "loss/idx": 11.0, "loss/logits": 0.26688338816165924, "step": 9847 }, { "epoch": 0.1470520162163373, "grad_norm": 0.3515625, "grad_norm_var": 0.004125912984212239, "learning_rate": 0.0001, "loss": 1.5571, "loss/crossentropy": 2.6568737030029297, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.23290330171585083, "step": 9848 }, { "epoch": 0.14706694838695228, "grad_norm": 0.36328125, "grad_norm_var": 0.004156096776326498, "learning_rate": 0.0001, "loss": 1.4505, "loss/crossentropy": 2.610807418823242, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20046202838420868, "step": 9849 }, { "epoch": 0.14708188055756724, "grad_norm": 0.318359375, "grad_norm_var": 0.004186248779296875, "learning_rate": 0.0001, "loss": 1.3586, "loss/crossentropy": 2.649927496910095, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1789446696639061, "step": 9850 }, { "epoch": 0.14709681272818223, "grad_norm": 0.345703125, "grad_norm_var": 0.004006449381510417, "learning_rate": 0.0001, "loss": 1.4358, "loss/crossentropy": 2.3669137954711914, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20142105966806412, "step": 9851 }, { "epoch": 0.14711174489879722, "grad_norm": 0.310546875, "grad_norm_var": 0.0036587874094645184, "learning_rate": 0.0001, "loss": 1.4164, "loss/crossentropy": 2.5232003927230835, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1898534819483757, "step": 9852 }, { "epoch": 0.14712667706941218, "grad_norm": 0.3203125, "grad_norm_var": 0.0034761905670166017, "learning_rate": 0.0001, "loss": 1.4049, "loss/crossentropy": 2.28743577003479, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1861194670200348, "step": 9853 }, { "epoch": 0.14714160924002717, "grad_norm": 0.302734375, "grad_norm_var": 0.0035777886708577475, "learning_rate": 0.0001, "loss": 1.3971, "loss/crossentropy": 2.8354262113571167, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1978689804673195, "step": 9854 }, { "epoch": 0.14715654141064216, "grad_norm": 0.306640625, "grad_norm_var": 0.003659804662068685, "learning_rate": 0.0001, "loss": 1.3921, "loss/crossentropy": 2.5608749389648438, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19285883009433746, "step": 9855 }, { "epoch": 0.14717147358125715, "grad_norm": 0.7265625, "grad_norm_var": 0.012381601333618163, "learning_rate": 0.0001, "loss": 1.7641, "loss/crossentropy": 2.6403859853744507, "loss/fcd": 1.51171875, "loss/idx": 11.0, "loss/logits": 0.2523331791162491, "step": 9856 }, { "epoch": 0.1471864057518721, "grad_norm": 0.34375, "grad_norm_var": 0.011003732681274414, "learning_rate": 0.0001, "loss": 1.617, "loss/crossentropy": 2.649690866470337, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.26545627415180206, "step": 9857 }, { "epoch": 0.1472013379224871, "grad_norm": 0.359375, "grad_norm_var": 0.010863860448201498, "learning_rate": 0.0001, "loss": 1.5381, "loss/crossentropy": 2.5567922592163086, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.22167565673589706, "step": 9858 }, { "epoch": 0.1472162700931021, "grad_norm": 0.30078125, "grad_norm_var": 0.010812314351399739, "learning_rate": 0.0001, "loss": 1.4896, "loss/crossentropy": 2.6259554624557495, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22006332129240036, "step": 9859 }, { "epoch": 0.14723120226371705, "grad_norm": 0.314453125, "grad_norm_var": 0.010956319173177083, "learning_rate": 0.0001, "loss": 1.4904, "loss/crossentropy": 2.34031879901886, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21308617293834686, "step": 9860 }, { "epoch": 0.14724613443433204, "grad_norm": 0.287109375, "grad_norm_var": 0.011255884170532226, "learning_rate": 0.0001, "loss": 1.438, "loss/crossentropy": 2.4902535676956177, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.21143274754285812, "step": 9861 }, { "epoch": 0.14726106660494703, "grad_norm": 0.33984375, "grad_norm_var": 0.010936339696248373, "learning_rate": 0.0001, "loss": 1.4412, "loss/crossentropy": 2.6185864210128784, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20685788244009018, "step": 9862 }, { "epoch": 0.14727599877556202, "grad_norm": 0.365234375, "grad_norm_var": 0.010497029622395833, "learning_rate": 0.0001, "loss": 1.7208, "loss/crossentropy": 2.4553571939468384, "loss/fcd": 1.46875, "loss/idx": 11.0, "loss/logits": 0.25205718725919724, "step": 9863 }, { "epoch": 0.14729093094617698, "grad_norm": 0.3359375, "grad_norm_var": 0.010516357421875, "learning_rate": 0.0001, "loss": 1.487, "loss/crossentropy": 2.618408441543579, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.2135610431432724, "step": 9864 }, { "epoch": 0.14730586311679197, "grad_norm": 0.345703125, "grad_norm_var": 0.010510492324829101, "learning_rate": 0.0001, "loss": 1.4312, "loss/crossentropy": 2.5808955430984497, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.18904031068086624, "step": 9865 }, { "epoch": 0.14732079528740696, "grad_norm": 0.31640625, "grad_norm_var": 0.01051934560139974, "learning_rate": 0.0001, "loss": 1.4517, "loss/crossentropy": 2.501280188560486, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.1899447739124298, "step": 9866 }, { "epoch": 0.14733572745802193, "grad_norm": 0.2890625, "grad_norm_var": 0.010762262344360351, "learning_rate": 0.0001, "loss": 1.4925, "loss/crossentropy": 2.620250105857849, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.23073910176753998, "step": 9867 }, { "epoch": 0.14735065962863692, "grad_norm": 0.34765625, "grad_norm_var": 0.010664113362630208, "learning_rate": 0.0001, "loss": 1.5198, "loss/crossentropy": 2.5372570753097534, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.22296860069036484, "step": 9868 }, { "epoch": 0.1473655917992519, "grad_norm": 0.30859375, "grad_norm_var": 0.010719235738118489, "learning_rate": 0.0001, "loss": 1.3161, "loss/crossentropy": 2.5715914964675903, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.16767814755439758, "step": 9869 }, { "epoch": 0.14738052396986687, "grad_norm": 0.34375, "grad_norm_var": 0.010569365819295247, "learning_rate": 0.0001, "loss": 1.3435, "loss/crossentropy": 2.7033859491348267, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1677439734339714, "step": 9870 }, { "epoch": 0.14739545614048186, "grad_norm": 0.365234375, "grad_norm_var": 0.010430129369099934, "learning_rate": 0.0001, "loss": 1.5111, "loss/crossentropy": 2.3753950595855713, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2181270867586136, "step": 9871 }, { "epoch": 0.14741038831109685, "grad_norm": 0.29296875, "grad_norm_var": 0.0007335503896077473, "learning_rate": 0.0001, "loss": 1.3324, "loss/crossentropy": 2.5320979356765747, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17219921201467514, "step": 9872 }, { "epoch": 0.14742532048171184, "grad_norm": 0.30859375, "grad_norm_var": 0.0007392724355061848, "learning_rate": 0.0001, "loss": 1.3926, "loss/crossentropy": 2.9235342741012573, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18552656471729279, "step": 9873 }, { "epoch": 0.1474402526523268, "grad_norm": 0.27734375, "grad_norm_var": 0.0007980187733968099, "learning_rate": 0.0001, "loss": 1.3296, "loss/crossentropy": 2.698441982269287, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18505119532346725, "step": 9874 }, { "epoch": 0.1474551848229418, "grad_norm": 0.310546875, "grad_norm_var": 0.000777435302734375, "learning_rate": 0.0001, "loss": 1.4214, "loss/crossentropy": 2.7318607568740845, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19871244579553604, "step": 9875 }, { "epoch": 0.14747011699355678, "grad_norm": 0.31640625, "grad_norm_var": 0.0007757663726806641, "learning_rate": 0.0001, "loss": 1.4353, "loss/crossentropy": 2.625616669654846, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.21268445253372192, "step": 9876 }, { "epoch": 0.14748504916417174, "grad_norm": 0.31640625, "grad_norm_var": 0.000693511962890625, "learning_rate": 0.0001, "loss": 1.427, "loss/crossentropy": 2.4522628784179688, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.1965121254324913, "step": 9877 }, { "epoch": 0.14749998133478673, "grad_norm": 0.302734375, "grad_norm_var": 0.0006998538970947266, "learning_rate": 0.0001, "loss": 1.3087, "loss/crossentropy": 2.542954683303833, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1641751453280449, "step": 9878 }, { "epoch": 0.14751491350540172, "grad_norm": 0.3203125, "grad_norm_var": 0.0005634943644205729, "learning_rate": 0.0001, "loss": 1.5133, "loss/crossentropy": 2.5242722034454346, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2281748354434967, "step": 9879 }, { "epoch": 0.1475298456760167, "grad_norm": 0.35546875, "grad_norm_var": 0.000632476806640625, "learning_rate": 0.0001, "loss": 1.501, "loss/crossentropy": 2.7147376537323, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.21581315994262695, "step": 9880 }, { "epoch": 0.14754477784663167, "grad_norm": 0.296875, "grad_norm_var": 0.0006130059560139974, "learning_rate": 0.0001, "loss": 1.3787, "loss/crossentropy": 2.578213930130005, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1872824802994728, "step": 9881 }, { "epoch": 0.14755971001724666, "grad_norm": 0.423828125, "grad_norm_var": 0.0013289769490559896, "learning_rate": 0.0001, "loss": 1.4511, "loss/crossentropy": 2.4925583600997925, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.1815587282180786, "step": 9882 }, { "epoch": 0.14757464218786165, "grad_norm": 0.27734375, "grad_norm_var": 0.0013913472493489584, "learning_rate": 0.0001, "loss": 1.281, "loss/crossentropy": 2.5563313961029053, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.1677457019686699, "step": 9883 }, { "epoch": 0.1475895743584766, "grad_norm": 0.2734375, "grad_norm_var": 0.0014891942342122396, "learning_rate": 0.0001, "loss": 1.3174, "loss/crossentropy": 2.6608316898345947, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1807001829147339, "step": 9884 }, { "epoch": 0.1476045065290916, "grad_norm": 0.294921875, "grad_norm_var": 0.00151823361714681, "learning_rate": 0.0001, "loss": 1.2742, "loss/crossentropy": 2.7623804807662964, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.15700393915176392, "step": 9885 }, { "epoch": 0.1476194386997066, "grad_norm": 0.3125, "grad_norm_var": 0.0014688968658447266, "learning_rate": 0.0001, "loss": 1.5268, "loss/crossentropy": 2.618007779121399, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.22994154691696167, "step": 9886 }, { "epoch": 0.14763437087032155, "grad_norm": 0.369140625, "grad_norm_var": 0.0014958540598551432, "learning_rate": 0.0001, "loss": 1.504, "loss/crossentropy": 2.5262789726257324, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21494438499212265, "step": 9887 }, { "epoch": 0.14764930304093654, "grad_norm": 0.330078125, "grad_norm_var": 0.001470184326171875, "learning_rate": 0.0001, "loss": 1.4927, "loss/crossentropy": 2.81948983669281, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21923279762268066, "step": 9888 }, { "epoch": 0.14766423521155153, "grad_norm": 0.29296875, "grad_norm_var": 0.0015047709147135416, "learning_rate": 0.0001, "loss": 1.3519, "loss/crossentropy": 2.902293562889099, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.19178561866283417, "step": 9889 }, { "epoch": 0.14767916738216652, "grad_norm": 0.326171875, "grad_norm_var": 0.0013962904612223306, "learning_rate": 0.0001, "loss": 1.4977, "loss/crossentropy": 2.8349883556365967, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.23602183908224106, "step": 9890 }, { "epoch": 0.14769409955278148, "grad_norm": 0.3046875, "grad_norm_var": 0.0014057795206705729, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.3126381635665894, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.19841470569372177, "step": 9891 }, { "epoch": 0.14770903172339647, "grad_norm": 0.306640625, "grad_norm_var": 0.0014158725738525391, "learning_rate": 0.0001, "loss": 1.3002, "loss/crossentropy": 2.6698659658432007, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.15955184400081635, "step": 9892 }, { "epoch": 0.14772396389401146, "grad_norm": 0.306640625, "grad_norm_var": 0.0014251708984375, "learning_rate": 0.0001, "loss": 1.5396, "loss/crossentropy": 2.6265307664871216, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2270624339580536, "step": 9893 }, { "epoch": 0.14773889606462642, "grad_norm": 0.310546875, "grad_norm_var": 0.0014127095540364583, "learning_rate": 0.0001, "loss": 1.4699, "loss/crossentropy": 2.6787526607513428, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21206849068403244, "step": 9894 }, { "epoch": 0.1477538282352414, "grad_norm": 0.36328125, "grad_norm_var": 0.0015364964803059897, "learning_rate": 0.0001, "loss": 1.4907, "loss/crossentropy": 2.661368489265442, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21730928122997284, "step": 9895 }, { "epoch": 0.1477687604058564, "grad_norm": 0.33203125, "grad_norm_var": 0.0014647801717122396, "learning_rate": 0.0001, "loss": 1.4357, "loss/crossentropy": 2.4530338048934937, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.1974365934729576, "step": 9896 }, { "epoch": 0.14778369257647136, "grad_norm": 0.296875, "grad_norm_var": 0.0014647801717122396, "learning_rate": 0.0001, "loss": 1.4013, "loss/crossentropy": 2.624413847923279, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19036821275949478, "step": 9897 }, { "epoch": 0.14779862474708635, "grad_norm": 0.306640625, "grad_norm_var": 0.0007018407185872395, "learning_rate": 0.0001, "loss": 1.3637, "loss/crossentropy": 2.725583791732788, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1839890033006668, "step": 9898 }, { "epoch": 0.14781355691770134, "grad_norm": 0.29296875, "grad_norm_var": 0.0006433486938476563, "learning_rate": 0.0001, "loss": 1.4183, "loss/crossentropy": 2.489895224571228, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19170641154050827, "step": 9899 }, { "epoch": 0.14782848908831633, "grad_norm": 0.34765625, "grad_norm_var": 0.0005889892578125, "learning_rate": 0.0001, "loss": 1.5812, "loss/crossentropy": 2.627989411354065, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.24521686136722565, "step": 9900 }, { "epoch": 0.1478434212589313, "grad_norm": 0.302734375, "grad_norm_var": 0.000568389892578125, "learning_rate": 0.0001, "loss": 1.3574, "loss/crossentropy": 2.544512629508972, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18158023059368134, "step": 9901 }, { "epoch": 0.14785835342954629, "grad_norm": 0.3046875, "grad_norm_var": 0.0005788167317708333, "learning_rate": 0.0001, "loss": 1.3494, "loss/crossentropy": 2.509726643562317, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.17357901483774185, "step": 9902 }, { "epoch": 0.14787328560016127, "grad_norm": 0.330078125, "grad_norm_var": 0.000409698486328125, "learning_rate": 0.0001, "loss": 1.4734, "loss/crossentropy": 2.330078601837158, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20384352654218674, "step": 9903 }, { "epoch": 0.14788821777077624, "grad_norm": 0.3203125, "grad_norm_var": 0.0003972212473551432, "learning_rate": 0.0001, "loss": 1.4381, "loss/crossentropy": 2.8315560817718506, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19983197003602982, "step": 9904 }, { "epoch": 0.14790314994139123, "grad_norm": 0.31640625, "grad_norm_var": 0.0003617445627848307, "learning_rate": 0.0001, "loss": 1.5015, "loss/crossentropy": 2.7554045915603638, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.21633779257535934, "step": 9905 }, { "epoch": 0.14791808211200622, "grad_norm": 0.310546875, "grad_norm_var": 0.0003574212392171224, "learning_rate": 0.0001, "loss": 1.4259, "loss/crossentropy": 2.7302169799804688, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19150236994028091, "step": 9906 }, { "epoch": 0.1479330142826212, "grad_norm": 0.33203125, "grad_norm_var": 0.0003636519114176432, "learning_rate": 0.0001, "loss": 1.5889, "loss/crossentropy": 2.4466500282287598, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.23342512547969818, "step": 9907 }, { "epoch": 0.14794794645323617, "grad_norm": 0.37109375, "grad_norm_var": 0.0005299250284830729, "learning_rate": 0.0001, "loss": 1.4381, "loss/crossentropy": 2.7629061937332153, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.195961594581604, "step": 9908 }, { "epoch": 0.14796287862385116, "grad_norm": 0.365234375, "grad_norm_var": 0.0006281534830729166, "learning_rate": 0.0001, "loss": 1.5247, "loss/crossentropy": 2.6044020652770996, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.2199799045920372, "step": 9909 }, { "epoch": 0.14797781079446615, "grad_norm": 0.39453125, "grad_norm_var": 0.0009049574534098307, "learning_rate": 0.0001, "loss": 1.5995, "loss/crossentropy": 2.6834797859191895, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.2322736158967018, "step": 9910 }, { "epoch": 0.1479927429650811, "grad_norm": 0.423828125, "grad_norm_var": 0.0013991673787434897, "learning_rate": 0.0001, "loss": 1.6987, "loss/crossentropy": 2.847881555557251, "loss/fcd": 1.4453125, "loss/idx": 11.0, "loss/logits": 0.2533733993768692, "step": 9911 }, { "epoch": 0.1480076751356961, "grad_norm": 0.322265625, "grad_norm_var": 0.0014079888661702475, "learning_rate": 0.0001, "loss": 1.4684, "loss/crossentropy": 2.838471293449402, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21061835438013077, "step": 9912 }, { "epoch": 0.1480226073063111, "grad_norm": 0.291015625, "grad_norm_var": 0.0014388402303059896, "learning_rate": 0.0001, "loss": 1.3592, "loss/crossentropy": 2.682753086090088, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19125181436538696, "step": 9913 }, { "epoch": 0.14803753947692605, "grad_norm": 0.30078125, "grad_norm_var": 0.0014617760976155599, "learning_rate": 0.0001, "loss": 1.3908, "loss/crossentropy": 2.647762656211853, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19550549238920212, "step": 9914 }, { "epoch": 0.14805247164754104, "grad_norm": 0.3125, "grad_norm_var": 0.0013816674550374349, "learning_rate": 0.0001, "loss": 1.4092, "loss/crossentropy": 2.5292123556137085, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1982479840517044, "step": 9915 }, { "epoch": 0.14806740381815603, "grad_norm": 0.376953125, "grad_norm_var": 0.0014882405598958333, "learning_rate": 0.0001, "loss": 1.5543, "loss/crossentropy": 2.562419295310974, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.22226238250732422, "step": 9916 }, { "epoch": 0.14808233598877102, "grad_norm": 0.310546875, "grad_norm_var": 0.0014574686686197917, "learning_rate": 0.0001, "loss": 1.4237, "loss/crossentropy": 2.6545623540878296, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19718679785728455, "step": 9917 }, { "epoch": 0.14809726815938598, "grad_norm": 0.42578125, "grad_norm_var": 0.0018615086873372396, "learning_rate": 0.0001, "loss": 1.5829, "loss/crossentropy": 2.705455780029297, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.25864802300930023, "step": 9918 }, { "epoch": 0.14811220033000097, "grad_norm": 0.310546875, "grad_norm_var": 0.0019215901692708334, "learning_rate": 0.0001, "loss": 1.3443, "loss/crossentropy": 2.501283645629883, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18416975438594818, "step": 9919 }, { "epoch": 0.14812713250061596, "grad_norm": 0.349609375, "grad_norm_var": 0.00188749631245931, "learning_rate": 0.0001, "loss": 1.5418, "loss/crossentropy": 2.566258668899536, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.24488143622875214, "step": 9920 }, { "epoch": 0.14814206467123092, "grad_norm": 0.291015625, "grad_norm_var": 0.0020232518513997396, "learning_rate": 0.0001, "loss": 1.3565, "loss/crossentropy": 2.7688212394714355, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18855369091033936, "step": 9921 }, { "epoch": 0.1481569968418459, "grad_norm": 0.359375, "grad_norm_var": 0.001960865656534831, "learning_rate": 0.0001, "loss": 1.5097, "loss/crossentropy": 2.6762309074401855, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.2128671035170555, "step": 9922 }, { "epoch": 0.1481719290124609, "grad_norm": 0.318359375, "grad_norm_var": 0.001998138427734375, "learning_rate": 0.0001, "loss": 1.5336, "loss/crossentropy": 2.7308614253997803, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.29136957228183746, "step": 9923 }, { "epoch": 0.1481868611830759, "grad_norm": 0.326171875, "grad_norm_var": 0.0019692579905192056, "learning_rate": 0.0001, "loss": 1.4607, "loss/crossentropy": 2.3966351747512817, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.18338826298713684, "step": 9924 }, { "epoch": 0.14820179335369085, "grad_norm": 0.419921875, "grad_norm_var": 0.0023226261138916014, "learning_rate": 0.0001, "loss": 1.564, "loss/crossentropy": 2.4935790300369263, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.21247775852680206, "step": 9925 }, { "epoch": 0.14821672552430584, "grad_norm": 0.314453125, "grad_norm_var": 0.002203369140625, "learning_rate": 0.0001, "loss": 1.4334, "loss/crossentropy": 2.712188959121704, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20292527973651886, "step": 9926 }, { "epoch": 0.14823165769492083, "grad_norm": 0.296875, "grad_norm_var": 0.0018056074778238932, "learning_rate": 0.0001, "loss": 1.3547, "loss/crossentropy": 2.7019898891448975, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18675809353590012, "step": 9927 }, { "epoch": 0.1482465898655358, "grad_norm": 0.341796875, "grad_norm_var": 0.0018017927805582682, "learning_rate": 0.0001, "loss": 1.3646, "loss/crossentropy": 2.760294556617737, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18880608677864075, "step": 9928 }, { "epoch": 0.14826152203615078, "grad_norm": 0.28125, "grad_norm_var": 0.001863861083984375, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.4194353818893433, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18470507115125656, "step": 9929 }, { "epoch": 0.14827645420676577, "grad_norm": 0.337890625, "grad_norm_var": 0.0017880598704020183, "learning_rate": 0.0001, "loss": 1.6114, "loss/crossentropy": 2.517609477043152, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.25206536054611206, "step": 9930 }, { "epoch": 0.14829138637738074, "grad_norm": 0.326171875, "grad_norm_var": 0.0017572402954101562, "learning_rate": 0.0001, "loss": 1.4303, "loss/crossentropy": 2.4689695835113525, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19979706406593323, "step": 9931 }, { "epoch": 0.14830631854799572, "grad_norm": 0.314453125, "grad_norm_var": 0.0016656875610351562, "learning_rate": 0.0001, "loss": 1.361, "loss/crossentropy": 2.6075968742370605, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1852387636899948, "step": 9932 }, { "epoch": 0.14832125071861071, "grad_norm": 0.333984375, "grad_norm_var": 0.0016305923461914062, "learning_rate": 0.0001, "loss": 1.3485, "loss/crossentropy": 2.779491662979126, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18439516425132751, "step": 9933 }, { "epoch": 0.1483361828892257, "grad_norm": 0.333984375, "grad_norm_var": 0.001036691665649414, "learning_rate": 0.0001, "loss": 1.4634, "loss/crossentropy": 2.5592355728149414, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21342933177947998, "step": 9934 }, { "epoch": 0.14835111505984067, "grad_norm": 0.5390625, "grad_norm_var": 0.003753662109375, "learning_rate": 0.0001, "loss": 1.8287, "loss/crossentropy": 2.0809115767478943, "loss/fcd": 1.53515625, "loss/idx": 11.0, "loss/logits": 0.29355281591415405, "step": 9935 }, { "epoch": 0.14836604723045566, "grad_norm": 0.345703125, "grad_norm_var": 0.003751055399576823, "learning_rate": 0.0001, "loss": 1.5543, "loss/crossentropy": 2.59328556060791, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.23787906765937805, "step": 9936 }, { "epoch": 0.14838097940107065, "grad_norm": 0.314453125, "grad_norm_var": 0.003624407450358073, "learning_rate": 0.0001, "loss": 1.5401, "loss/crossentropy": 2.3047218322753906, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2275879830121994, "step": 9937 }, { "epoch": 0.1483959115716856, "grad_norm": 0.3046875, "grad_norm_var": 0.003699175516764323, "learning_rate": 0.0001, "loss": 1.57, "loss/crossentropy": 2.6797441244125366, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.23402491211891174, "step": 9938 }, { "epoch": 0.1484108437423006, "grad_norm": 0.330078125, "grad_norm_var": 0.0036730448404947917, "learning_rate": 0.0001, "loss": 1.4869, "loss/crossentropy": 2.5646501779556274, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.23298382759094238, "step": 9939 }, { "epoch": 0.1484257759129156, "grad_norm": 0.30859375, "grad_norm_var": 0.0037278334299723306, "learning_rate": 0.0001, "loss": 1.4056, "loss/crossentropy": 2.6139670610427856, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.198575958609581, "step": 9940 }, { "epoch": 0.14844070808353058, "grad_norm": 0.380859375, "grad_norm_var": 0.003408034642537435, "learning_rate": 0.0001, "loss": 1.4806, "loss/crossentropy": 2.9837453365325928, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.22281581163406372, "step": 9941 }, { "epoch": 0.14845564025414554, "grad_norm": 0.34765625, "grad_norm_var": 0.00337371826171875, "learning_rate": 0.0001, "loss": 1.596, "loss/crossentropy": 2.837248682975769, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.2639351934194565, "step": 9942 }, { "epoch": 0.14847057242476053, "grad_norm": 0.36328125, "grad_norm_var": 0.0032688776652018228, "learning_rate": 0.0001, "loss": 1.7117, "loss/crossentropy": 2.75117826461792, "loss/fcd": 1.44140625, "loss/idx": 11.0, "loss/logits": 0.270314522087574, "step": 9943 }, { "epoch": 0.14848550459537552, "grad_norm": 0.349609375, "grad_norm_var": 0.003270403544108073, "learning_rate": 0.0001, "loss": 1.6544, "loss/crossentropy": 2.5963661670684814, "loss/fcd": 1.40234375, "loss/idx": 11.0, "loss/logits": 0.25205111503601074, "step": 9944 }, { "epoch": 0.14850043676599048, "grad_norm": 0.33203125, "grad_norm_var": 0.0030034383138020835, "learning_rate": 0.0001, "loss": 1.4395, "loss/crossentropy": 2.6319910287857056, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20120927691459656, "step": 9945 }, { "epoch": 0.14851536893660547, "grad_norm": 0.333984375, "grad_norm_var": 0.0030094782511393228, "learning_rate": 0.0001, "loss": 1.5156, "loss/crossentropy": 2.4330555200576782, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.21482612937688828, "step": 9946 }, { "epoch": 0.14853030110722046, "grad_norm": 0.275390625, "grad_norm_var": 0.0033144632975260417, "learning_rate": 0.0001, "loss": 1.3376, "loss/crossentropy": 2.642804265022278, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18134743720293045, "step": 9947 }, { "epoch": 0.14854523327783542, "grad_norm": 0.283203125, "grad_norm_var": 0.003499603271484375, "learning_rate": 0.0001, "loss": 1.362, "loss/crossentropy": 2.548025369644165, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18617308884859085, "step": 9948 }, { "epoch": 0.1485601654484504, "grad_norm": 0.30078125, "grad_norm_var": 0.0036052544911702473, "learning_rate": 0.0001, "loss": 1.3232, "loss/crossentropy": 2.550156354904175, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17474640905857086, "step": 9949 }, { "epoch": 0.1485750976190654, "grad_norm": 0.318359375, "grad_norm_var": 0.003633483250935872, "learning_rate": 0.0001, "loss": 1.3488, "loss/crossentropy": 2.441872239112854, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17691317200660706, "step": 9950 }, { "epoch": 0.1485900297896804, "grad_norm": 0.8671875, "grad_norm_var": 0.01910513242085775, "learning_rate": 0.0001, "loss": 2.4531, "loss/crossentropy": 2.4709209203720093, "loss/fcd": 2.125, "loss/idx": 11.0, "loss/logits": 0.32806846499443054, "step": 9951 }, { "epoch": 0.14860496196029535, "grad_norm": 0.2890625, "grad_norm_var": 0.019411659240722655, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.6070648431777954, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17874959856271744, "step": 9952 }, { "epoch": 0.14861989413091034, "grad_norm": 0.322265625, "grad_norm_var": 0.019371986389160156, "learning_rate": 0.0001, "loss": 1.493, "loss/crossentropy": 2.6012221574783325, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.22733820974826813, "step": 9953 }, { "epoch": 0.14863482630152533, "grad_norm": 0.296875, "grad_norm_var": 0.019429969787597656, "learning_rate": 0.0001, "loss": 1.3543, "loss/crossentropy": 2.6562470197677612, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1902240589261055, "step": 9954 }, { "epoch": 0.1486497584721403, "grad_norm": 0.302734375, "grad_norm_var": 0.019571940104166668, "learning_rate": 0.0001, "loss": 1.2992, "loss/crossentropy": 2.765205979347229, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.16243147104978561, "step": 9955 }, { "epoch": 0.14866469064275528, "grad_norm": 0.330078125, "grad_norm_var": 0.019469308853149413, "learning_rate": 0.0001, "loss": 1.4667, "loss/crossentropy": 2.891927719116211, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.21274696290493011, "step": 9956 }, { "epoch": 0.14867962281337027, "grad_norm": 0.34765625, "grad_norm_var": 0.01942742665608724, "learning_rate": 0.0001, "loss": 1.4161, "loss/crossentropy": 2.5581672191619873, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1973678320646286, "step": 9957 }, { "epoch": 0.14869455498398523, "grad_norm": 0.298828125, "grad_norm_var": 0.019616174697875976, "learning_rate": 0.0001, "loss": 1.4337, "loss/crossentropy": 2.5309109687805176, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.19149737060070038, "step": 9958 }, { "epoch": 0.14870948715460022, "grad_norm": 0.390625, "grad_norm_var": 0.01970874468485514, "learning_rate": 0.0001, "loss": 1.5452, "loss/crossentropy": 2.8683555126190186, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.20924416929483414, "step": 9959 }, { "epoch": 0.1487244193252152, "grad_norm": 0.333984375, "grad_norm_var": 0.0197298526763916, "learning_rate": 0.0001, "loss": 1.5188, "loss/crossentropy": 2.6439303159713745, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.22193047404289246, "step": 9960 }, { "epoch": 0.1487393514958302, "grad_norm": 0.328125, "grad_norm_var": 0.019740915298461913, "learning_rate": 0.0001, "loss": 1.4137, "loss/crossentropy": 2.478973150253296, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19105792045593262, "step": 9961 }, { "epoch": 0.14875428366644516, "grad_norm": 0.36328125, "grad_norm_var": 0.019727325439453124, "learning_rate": 0.0001, "loss": 1.5355, "loss/crossentropy": 2.43994402885437, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.19569196552038193, "step": 9962 }, { "epoch": 0.14876921583706015, "grad_norm": 0.298828125, "grad_norm_var": 0.01951904296875, "learning_rate": 0.0001, "loss": 1.3831, "loss/crossentropy": 2.6058154106140137, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.17995739728212357, "step": 9963 }, { "epoch": 0.14878414800767514, "grad_norm": 0.416015625, "grad_norm_var": 0.019359079996744792, "learning_rate": 0.0001, "loss": 1.4384, "loss/crossentropy": 2.7571537494659424, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2118477299809456, "step": 9964 }, { "epoch": 0.1487990801782901, "grad_norm": 0.287109375, "grad_norm_var": 0.01948380470275879, "learning_rate": 0.0001, "loss": 1.3738, "loss/crossentropy": 2.6057021617889404, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19017772376537323, "step": 9965 }, { "epoch": 0.1488140123489051, "grad_norm": 0.306640625, "grad_norm_var": 0.019560480117797853, "learning_rate": 0.0001, "loss": 1.356, "loss/crossentropy": 2.6714959144592285, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1802227720618248, "step": 9966 }, { "epoch": 0.14882894451952008, "grad_norm": 0.287109375, "grad_norm_var": 0.0014566421508789063, "learning_rate": 0.0001, "loss": 1.3827, "loss/crossentropy": 2.635837197303772, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18735767900943756, "step": 9967 }, { "epoch": 0.14884387669013507, "grad_norm": 0.28125, "grad_norm_var": 0.0014978408813476562, "learning_rate": 0.0001, "loss": 1.3132, "loss/crossentropy": 2.4782058000564575, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.16863833367824554, "step": 9968 }, { "epoch": 0.14885880886075004, "grad_norm": 0.291015625, "grad_norm_var": 0.0015680313110351563, "learning_rate": 0.0001, "loss": 1.3761, "loss/crossentropy": 2.525423765182495, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1846856251358986, "step": 9969 }, { "epoch": 0.14887374103136503, "grad_norm": 0.31640625, "grad_norm_var": 0.001525115966796875, "learning_rate": 0.0001, "loss": 1.3811, "loss/crossentropy": 2.7397522926330566, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19364453852176666, "step": 9970 }, { "epoch": 0.14888867320198002, "grad_norm": 0.283203125, "grad_norm_var": 0.0016036351521809896, "learning_rate": 0.0001, "loss": 1.4267, "loss/crossentropy": 2.425156354904175, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20795881003141403, "step": 9971 }, { "epoch": 0.14890360537259498, "grad_norm": 0.330078125, "grad_norm_var": 0.0016036351521809896, "learning_rate": 0.0001, "loss": 1.3672, "loss/crossentropy": 2.291429877281189, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.16799553483724594, "step": 9972 }, { "epoch": 0.14891853754320997, "grad_norm": 0.3359375, "grad_norm_var": 0.0015729268391927083, "learning_rate": 0.0001, "loss": 1.5433, "loss/crossentropy": 2.293518304824829, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.21517211943864822, "step": 9973 }, { "epoch": 0.14893346971382496, "grad_norm": 0.30078125, "grad_norm_var": 0.0015671888987223307, "learning_rate": 0.0001, "loss": 1.3863, "loss/crossentropy": 2.741959571838379, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19094091653823853, "step": 9974 }, { "epoch": 0.14894840188443992, "grad_norm": 0.37890625, "grad_norm_var": 0.0014683882395426433, "learning_rate": 0.0001, "loss": 1.6695, "loss/crossentropy": 2.4285502433776855, "loss/fcd": 1.421875, "loss/idx": 11.0, "loss/logits": 0.24761705100536346, "step": 9975 }, { "epoch": 0.1489633340550549, "grad_norm": 0.306640625, "grad_norm_var": 0.0014683882395426433, "learning_rate": 0.0001, "loss": 1.3557, "loss/crossentropy": 2.479637384414673, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18770696222782135, "step": 9976 }, { "epoch": 0.1489782662256699, "grad_norm": 0.310546875, "grad_norm_var": 0.0014673868815104167, "learning_rate": 0.0001, "loss": 1.4216, "loss/crossentropy": 2.3955576419830322, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20677809417247772, "step": 9977 }, { "epoch": 0.1489931983962849, "grad_norm": 0.32421875, "grad_norm_var": 0.0013287862141927083, "learning_rate": 0.0001, "loss": 1.4513, "loss/crossentropy": 2.5193278789520264, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20907491445541382, "step": 9978 }, { "epoch": 0.14900813056689985, "grad_norm": 0.314453125, "grad_norm_var": 0.001308441162109375, "learning_rate": 0.0001, "loss": 1.4773, "loss/crossentropy": 2.4523227214813232, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.1843232661485672, "step": 9979 }, { "epoch": 0.14902306273751484, "grad_norm": 0.3203125, "grad_norm_var": 0.0006160577138264974, "learning_rate": 0.0001, "loss": 1.4025, "loss/crossentropy": 2.609233260154724, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19152122735977173, "step": 9980 }, { "epoch": 0.14903799490812983, "grad_norm": 0.30078125, "grad_norm_var": 0.0005843480428059896, "learning_rate": 0.0001, "loss": 1.4455, "loss/crossentropy": 2.6205214262008667, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.19936518371105194, "step": 9981 }, { "epoch": 0.1490529270787448, "grad_norm": 0.31640625, "grad_norm_var": 0.0005836327870686849, "learning_rate": 0.0001, "loss": 1.4568, "loss/crossentropy": 1.9974554181098938, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.1872815191745758, "step": 9982 }, { "epoch": 0.14906785924935978, "grad_norm": 0.283203125, "grad_norm_var": 0.0005977471669514974, "learning_rate": 0.0001, "loss": 1.5096, "loss/crossentropy": 2.5353070497512817, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.22445349395275116, "step": 9983 }, { "epoch": 0.14908279141997477, "grad_norm": 0.322265625, "grad_norm_var": 0.0005339940388997395, "learning_rate": 0.0001, "loss": 1.4247, "loss/crossentropy": 2.5086315870285034, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.1903250813484192, "step": 9984 }, { "epoch": 0.14909772359058976, "grad_norm": 0.287109375, "grad_norm_var": 0.0005472819010416667, "learning_rate": 0.0001, "loss": 1.3846, "loss/crossentropy": 2.733398199081421, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.1775510236620903, "step": 9985 }, { "epoch": 0.14911265576120472, "grad_norm": 0.30078125, "grad_norm_var": 0.0005584716796875, "learning_rate": 0.0001, "loss": 1.4511, "loss/crossentropy": 2.6512539386749268, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20894096791744232, "step": 9986 }, { "epoch": 0.1491275879318197, "grad_norm": 0.328125, "grad_norm_var": 0.0005032698313395183, "learning_rate": 0.0001, "loss": 1.6246, "loss/crossentropy": 2.3421237468719482, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.2417861446738243, "step": 9987 }, { "epoch": 0.1491425201024347, "grad_norm": 0.59375, "grad_norm_var": 0.0053333918253580725, "learning_rate": 0.0001, "loss": 1.5486, "loss/crossentropy": 2.73013174533844, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.2322331815958023, "step": 9988 }, { "epoch": 0.14915745227304966, "grad_norm": 0.369140625, "grad_norm_var": 0.005416345596313476, "learning_rate": 0.0001, "loss": 1.5246, "loss/crossentropy": 2.624979615211487, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.21989969909191132, "step": 9989 }, { "epoch": 0.14917238444366465, "grad_norm": 0.333984375, "grad_norm_var": 0.00533447265625, "learning_rate": 0.0001, "loss": 1.3996, "loss/crossentropy": 2.7950432300567627, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19256138801574707, "step": 9990 }, { "epoch": 0.14918731661427964, "grad_norm": 0.283203125, "grad_norm_var": 0.00537107785542806, "learning_rate": 0.0001, "loss": 1.3078, "loss/crossentropy": 2.6103904247283936, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.16714739799499512, "step": 9991 }, { "epoch": 0.1492022487848946, "grad_norm": 0.3359375, "grad_norm_var": 0.00532983144124349, "learning_rate": 0.0001, "loss": 1.336, "loss/crossentropy": 2.465618848800659, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.17190426588058472, "step": 9992 }, { "epoch": 0.1492171809555096, "grad_norm": 0.333984375, "grad_norm_var": 0.00529473622639974, "learning_rate": 0.0001, "loss": 1.3335, "loss/crossentropy": 2.7096222639083862, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1850210651755333, "step": 9993 }, { "epoch": 0.14923211312612458, "grad_norm": 0.3515625, "grad_norm_var": 0.005304972330729167, "learning_rate": 0.0001, "loss": 1.4628, "loss/crossentropy": 2.837370753288269, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.22845004498958588, "step": 9994 }, { "epoch": 0.14924704529673957, "grad_norm": 0.29296875, "grad_norm_var": 0.005395364761352539, "learning_rate": 0.0001, "loss": 1.3923, "loss/crossentropy": 2.522329092025757, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.1852220743894577, "step": 9995 }, { "epoch": 0.14926197746735453, "grad_norm": 0.30859375, "grad_norm_var": 0.005426263809204102, "learning_rate": 0.0001, "loss": 1.2355, "loss/crossentropy": 2.718490242958069, "loss/fcd": 1.08203125, "loss/idx": 11.0, "loss/logits": 0.15348413586616516, "step": 9996 }, { "epoch": 0.14927690963796952, "grad_norm": 0.3125, "grad_norm_var": 0.005383157730102539, "learning_rate": 0.0001, "loss": 1.4575, "loss/crossentropy": 2.62522554397583, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21533286571502686, "step": 9997 }, { "epoch": 0.1492918418085845, "grad_norm": 0.296875, "grad_norm_var": 0.005454365412394206, "learning_rate": 0.0001, "loss": 1.2994, "loss/crossentropy": 2.54520320892334, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.16270411014556885, "step": 9998 }, { "epoch": 0.14930677397919948, "grad_norm": 0.4375, "grad_norm_var": 0.005910174051920573, "learning_rate": 0.0001, "loss": 1.478, "loss/crossentropy": 2.799686908721924, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21240341663360596, "step": 9999 }, { "epoch": 0.14932170614981446, "grad_norm": 0.353515625, "grad_norm_var": 0.005884742736816407, "learning_rate": 0.0001, "loss": 1.5149, "loss/crossentropy": 2.6507344245910645, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.21408677846193314, "step": 10000 }, { "epoch": 0.14933663832042945, "grad_norm": 0.3671875, "grad_norm_var": 0.005667734146118164, "learning_rate": 0.0001, "loss": 1.5553, "loss/crossentropy": 2.7623863220214844, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.23112015426158905, "step": 10001 }, { "epoch": 0.14935157049104444, "grad_norm": 0.451171875, "grad_norm_var": 0.006094868977864583, "learning_rate": 0.0001, "loss": 1.6289, "loss/crossentropy": 2.52932870388031, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.2773768827319145, "step": 10002 }, { "epoch": 0.1493665026616594, "grad_norm": 0.32421875, "grad_norm_var": 0.006112098693847656, "learning_rate": 0.0001, "loss": 1.4168, "loss/crossentropy": 2.7445285320281982, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19804996252059937, "step": 10003 }, { "epoch": 0.1493814348322744, "grad_norm": 0.30859375, "grad_norm_var": 0.002273813883463542, "learning_rate": 0.0001, "loss": 1.5192, "loss/crossentropy": 2.3756293058395386, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.2457466721534729, "step": 10004 }, { "epoch": 0.14939636700288939, "grad_norm": 0.3203125, "grad_norm_var": 0.002241627375284831, "learning_rate": 0.0001, "loss": 1.3444, "loss/crossentropy": 2.6151636838912964, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1803642138838768, "step": 10005 }, { "epoch": 0.14941129917350435, "grad_norm": 0.37890625, "grad_norm_var": 0.0023421605428059896, "learning_rate": 0.0001, "loss": 1.6627, "loss/crossentropy": 2.2545249462127686, "loss/fcd": 1.44140625, "loss/idx": 11.0, "loss/logits": 0.22129400074481964, "step": 10006 }, { "epoch": 0.14942623134411934, "grad_norm": 0.3125, "grad_norm_var": 0.00216978391011556, "learning_rate": 0.0001, "loss": 1.4483, "loss/crossentropy": 2.52389657497406, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21001185476779938, "step": 10007 }, { "epoch": 0.14944116351473433, "grad_norm": 0.296875, "grad_norm_var": 0.0023013909657796225, "learning_rate": 0.0001, "loss": 1.3194, "loss/crossentropy": 2.714744210243225, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17091868817806244, "step": 10008 }, { "epoch": 0.1494560956853493, "grad_norm": 0.294921875, "grad_norm_var": 0.0024304548899332684, "learning_rate": 0.0001, "loss": 1.5888, "loss/crossentropy": 2.506785750389099, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.25282804667949677, "step": 10009 }, { "epoch": 0.14947102785596428, "grad_norm": 0.333984375, "grad_norm_var": 0.0024180094401041667, "learning_rate": 0.0001, "loss": 1.462, "loss/crossentropy": 2.731884002685547, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2119775265455246, "step": 10010 }, { "epoch": 0.14948596002657927, "grad_norm": 0.27734375, "grad_norm_var": 0.0025248209635416667, "learning_rate": 0.0001, "loss": 1.4358, "loss/crossentropy": 2.646640181541443, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2053777426481247, "step": 10011 }, { "epoch": 0.14950089219719426, "grad_norm": 0.65625, "grad_norm_var": 0.008811378479003906, "learning_rate": 0.0001, "loss": 1.6487, "loss/crossentropy": 2.5599491596221924, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.26982414722442627, "step": 10012 }, { "epoch": 0.14951582436780922, "grad_norm": 0.328125, "grad_norm_var": 0.00873254140218099, "learning_rate": 0.0001, "loss": 1.4223, "loss/crossentropy": 2.77813184261322, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20358280092477798, "step": 10013 }, { "epoch": 0.1495307565384242, "grad_norm": 0.318359375, "grad_norm_var": 0.00858445167541504, "learning_rate": 0.0001, "loss": 1.4977, "loss/crossentropy": 2.5969340801239014, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.22035370767116547, "step": 10014 }, { "epoch": 0.1495456887090392, "grad_norm": 0.296875, "grad_norm_var": 0.008367013931274415, "learning_rate": 0.0001, "loss": 1.4353, "loss/crossentropy": 2.7083898782730103, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2048574462532997, "step": 10015 }, { "epoch": 0.14956062087965416, "grad_norm": 0.345703125, "grad_norm_var": 0.008368412653605143, "learning_rate": 0.0001, "loss": 1.4034, "loss/crossentropy": 2.4626379013061523, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.18463688343763351, "step": 10016 }, { "epoch": 0.14957555305026915, "grad_norm": 0.380859375, "grad_norm_var": 0.008410135904947916, "learning_rate": 0.0001, "loss": 1.3529, "loss/crossentropy": 2.6466526985168457, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1849283128976822, "step": 10017 }, { "epoch": 0.14959048522088414, "grad_norm": 0.31640625, "grad_norm_var": 0.007755390803019206, "learning_rate": 0.0001, "loss": 1.3495, "loss/crossentropy": 2.6703191995620728, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.16590508818626404, "step": 10018 }, { "epoch": 0.1496054173914991, "grad_norm": 0.341796875, "grad_norm_var": 0.007730356852213542, "learning_rate": 0.0001, "loss": 1.5988, "loss/crossentropy": 2.9094265699386597, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.26285264641046524, "step": 10019 }, { "epoch": 0.1496203495621141, "grad_norm": 0.2890625, "grad_norm_var": 0.007847023010253907, "learning_rate": 0.0001, "loss": 1.3676, "loss/crossentropy": 2.5999271869659424, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19184299558401108, "step": 10020 }, { "epoch": 0.14963528173272908, "grad_norm": 0.384765625, "grad_norm_var": 0.00791153907775879, "learning_rate": 0.0001, "loss": 1.5993, "loss/crossentropy": 2.3658992052078247, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.23211021721363068, "step": 10021 }, { "epoch": 0.14965021390334407, "grad_norm": 0.34765625, "grad_norm_var": 0.007839822769165039, "learning_rate": 0.0001, "loss": 1.5154, "loss/crossentropy": 2.6702195405960083, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.21070361137390137, "step": 10022 }, { "epoch": 0.14966514607395903, "grad_norm": 0.33984375, "grad_norm_var": 0.007767724990844727, "learning_rate": 0.0001, "loss": 1.4039, "loss/crossentropy": 2.5785582065582275, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19295664876699448, "step": 10023 }, { "epoch": 0.14968007824457402, "grad_norm": 0.29296875, "grad_norm_var": 0.007794682184855143, "learning_rate": 0.0001, "loss": 1.3356, "loss/crossentropy": 2.6928786039352417, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17544914036989212, "step": 10024 }, { "epoch": 0.149695010415189, "grad_norm": 0.27734375, "grad_norm_var": 0.007935015360514323, "learning_rate": 0.0001, "loss": 1.4861, "loss/crossentropy": 2.4136310815811157, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2595192939043045, "step": 10025 }, { "epoch": 0.14970994258580397, "grad_norm": 0.37109375, "grad_norm_var": 0.007964309056599934, "learning_rate": 0.0001, "loss": 1.6696, "loss/crossentropy": 2.7122639417648315, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.2828778624534607, "step": 10026 }, { "epoch": 0.14972487475641896, "grad_norm": 0.294921875, "grad_norm_var": 0.00781853993733724, "learning_rate": 0.0001, "loss": 1.4512, "loss/crossentropy": 2.6492422819137573, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2168293222784996, "step": 10027 }, { "epoch": 0.14973980692703395, "grad_norm": 0.341796875, "grad_norm_var": 0.0011113325754801432, "learning_rate": 0.0001, "loss": 1.4365, "loss/crossentropy": 2.3788729906082153, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.1864946484565735, "step": 10028 }, { "epoch": 0.14975473909764894, "grad_norm": 0.375, "grad_norm_var": 0.0012417952219645181, "learning_rate": 0.0001, "loss": 1.6613, "loss/crossentropy": 2.2841047048568726, "loss/fcd": 1.4296875, "loss/idx": 11.0, "loss/logits": 0.23159711807966232, "step": 10029 }, { "epoch": 0.1497696712682639, "grad_norm": 0.42578125, "grad_norm_var": 0.00176544189453125, "learning_rate": 0.0001, "loss": 1.6396, "loss/crossentropy": 2.8491424322128296, "loss/fcd": 1.375, "loss/idx": 11.0, "loss/logits": 0.26459722220897675, "step": 10030 }, { "epoch": 0.1497846034388789, "grad_norm": 0.396484375, "grad_norm_var": 0.0018278598785400391, "learning_rate": 0.0001, "loss": 1.752, "loss/crossentropy": 2.376920700073242, "loss/fcd": 1.4921875, "loss/idx": 11.0, "loss/logits": 0.25979380309581757, "step": 10031 }, { "epoch": 0.14979953560949388, "grad_norm": 0.302734375, "grad_norm_var": 0.0019397576649983724, "learning_rate": 0.0001, "loss": 1.2718, "loss/crossentropy": 2.757275342941284, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.16632820665836334, "step": 10032 }, { "epoch": 0.14981446778010885, "grad_norm": 0.3515625, "grad_norm_var": 0.0018431981404622396, "learning_rate": 0.0001, "loss": 1.5071, "loss/crossentropy": 2.6361151933670044, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.22971054911613464, "step": 10033 }, { "epoch": 0.14982939995072383, "grad_norm": 0.337890625, "grad_norm_var": 0.001802810033162435, "learning_rate": 0.0001, "loss": 1.428, "loss/crossentropy": 2.706793189048767, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.2053312435746193, "step": 10034 }, { "epoch": 0.14984433212133882, "grad_norm": 0.3203125, "grad_norm_var": 0.0018320083618164062, "learning_rate": 0.0001, "loss": 1.2934, "loss/crossentropy": 2.838751196861267, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1684350222349167, "step": 10035 }, { "epoch": 0.1498592642919538, "grad_norm": 0.30078125, "grad_norm_var": 0.001760101318359375, "learning_rate": 0.0001, "loss": 1.43, "loss/crossentropy": 2.5991551876068115, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20347173511981964, "step": 10036 }, { "epoch": 0.14987419646256878, "grad_norm": 0.328125, "grad_norm_var": 0.0016324202219645183, "learning_rate": 0.0001, "loss": 1.428, "loss/crossentropy": 2.521088480949402, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.1936551108956337, "step": 10037 }, { "epoch": 0.14988912863318377, "grad_norm": 0.54296875, "grad_norm_var": 0.00427409807840983, "learning_rate": 0.0001, "loss": 1.8489, "loss/crossentropy": 2.650709390640259, "loss/fcd": 1.51171875, "loss/idx": 11.0, "loss/logits": 0.3371947407722473, "step": 10038 }, { "epoch": 0.14990406080379876, "grad_norm": 0.375, "grad_norm_var": 0.004303852717081706, "learning_rate": 0.0001, "loss": 1.5954, "loss/crossentropy": 2.3592958450317383, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.21648437529802322, "step": 10039 }, { "epoch": 0.14991899297441372, "grad_norm": 0.318359375, "grad_norm_var": 0.004143714904785156, "learning_rate": 0.0001, "loss": 1.3978, "loss/crossentropy": 2.6978588104248047, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19462747871875763, "step": 10040 }, { "epoch": 0.1499339251450287, "grad_norm": 0.345703125, "grad_norm_var": 0.0037392775217692056, "learning_rate": 0.0001, "loss": 1.526, "loss/crossentropy": 2.4393773078918457, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2134810984134674, "step": 10041 }, { "epoch": 0.1499488573156437, "grad_norm": 0.3515625, "grad_norm_var": 0.003729104995727539, "learning_rate": 0.0001, "loss": 1.3207, "loss/crossentropy": 2.5357311964035034, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.16445133090019226, "step": 10042 }, { "epoch": 0.14996378948625866, "grad_norm": 0.5625, "grad_norm_var": 0.005995941162109375, "learning_rate": 0.0001, "loss": 1.5714, "loss/crossentropy": 2.5114827156066895, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.23155555129051208, "step": 10043 }, { "epoch": 0.14997872165687365, "grad_norm": 0.326171875, "grad_norm_var": 0.006077321370442709, "learning_rate": 0.0001, "loss": 1.521, "loss/crossentropy": 2.6106258630752563, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.23979949951171875, "step": 10044 }, { "epoch": 0.14999365382748864, "grad_norm": 0.271484375, "grad_norm_var": 0.006713342666625976, "learning_rate": 0.0001, "loss": 1.3425, "loss/crossentropy": 2.618637442588806, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.19401369243860245, "step": 10045 }, { "epoch": 0.15000858599810363, "grad_norm": 0.341796875, "grad_norm_var": 0.006485748291015625, "learning_rate": 0.0001, "loss": 1.5879, "loss/crossentropy": 2.3733508586883545, "loss/fcd": 1.375, "loss/idx": 11.0, "loss/logits": 0.21286970376968384, "step": 10046 }, { "epoch": 0.1500235181687186, "grad_norm": 0.498046875, "grad_norm_var": 0.007613118489583333, "learning_rate": 0.0001, "loss": 1.6393, "loss/crossentropy": 2.4199471473693848, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.2525634989142418, "step": 10047 }, { "epoch": 0.15003845033933358, "grad_norm": 0.333984375, "grad_norm_var": 0.007405598958333333, "learning_rate": 0.0001, "loss": 1.4016, "loss/crossentropy": 2.6280083656311035, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1906472146511078, "step": 10048 }, { "epoch": 0.15005338250994857, "grad_norm": 0.337890625, "grad_norm_var": 0.00744932492574056, "learning_rate": 0.0001, "loss": 1.3814, "loss/crossentropy": 2.7301970720291138, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1900394856929779, "step": 10049 }, { "epoch": 0.15006831468056353, "grad_norm": 0.306640625, "grad_norm_var": 0.00763700803120931, "learning_rate": 0.0001, "loss": 1.5662, "loss/crossentropy": 2.5372118949890137, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.2576006203889847, "step": 10050 }, { "epoch": 0.15008324685117852, "grad_norm": 0.333984375, "grad_norm_var": 0.0075647989908854164, "learning_rate": 0.0001, "loss": 1.547, "loss/crossentropy": 2.9493935108184814, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.23055219650268555, "step": 10051 }, { "epoch": 0.1500981790217935, "grad_norm": 0.29296875, "grad_norm_var": 0.007637786865234375, "learning_rate": 0.0001, "loss": 1.379, "loss/crossentropy": 2.7618097066879272, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19152862578630447, "step": 10052 }, { "epoch": 0.15011311119240847, "grad_norm": 0.388671875, "grad_norm_var": 0.0075555006663004555, "learning_rate": 0.0001, "loss": 1.5897, "loss/crossentropy": 2.3600860834121704, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.22249656915664673, "step": 10053 }, { "epoch": 0.15012804336302346, "grad_norm": 0.3203125, "grad_norm_var": 0.005533329645792643, "learning_rate": 0.0001, "loss": 1.5256, "loss/crossentropy": 2.4162697792053223, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.21699263900518417, "step": 10054 }, { "epoch": 0.15014297553363845, "grad_norm": 0.302734375, "grad_norm_var": 0.005682118733723958, "learning_rate": 0.0001, "loss": 1.4762, "loss/crossentropy": 2.7445703744888306, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21840079128742218, "step": 10055 }, { "epoch": 0.15015790770425344, "grad_norm": 0.30078125, "grad_norm_var": 0.0057803948720296225, "learning_rate": 0.0001, "loss": 1.5142, "loss/crossentropy": 2.580042004585266, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.23294514417648315, "step": 10056 }, { "epoch": 0.1501728398748684, "grad_norm": 0.314453125, "grad_norm_var": 0.005863300959269206, "learning_rate": 0.0001, "loss": 1.4877, "loss/crossentropy": 2.7181469202041626, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.210326686501503, "step": 10057 }, { "epoch": 0.1501877720454834, "grad_norm": 0.341796875, "grad_norm_var": 0.005865923563639323, "learning_rate": 0.0001, "loss": 1.5566, "loss/crossentropy": 2.4046086072921753, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.2206786870956421, "step": 10058 }, { "epoch": 0.15020270421609838, "grad_norm": 0.35546875, "grad_norm_var": 0.002634429931640625, "learning_rate": 0.0001, "loss": 1.5967, "loss/crossentropy": 2.3196628093719482, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.2334202080965042, "step": 10059 }, { "epoch": 0.15021763638671334, "grad_norm": 0.349609375, "grad_norm_var": 0.0026397705078125, "learning_rate": 0.0001, "loss": 1.5558, "loss/crossentropy": 2.5902713537216187, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.2238026261329651, "step": 10060 }, { "epoch": 0.15023256855732833, "grad_norm": 0.302734375, "grad_norm_var": 0.0024281819661458332, "learning_rate": 0.0001, "loss": 1.3949, "loss/crossentropy": 2.600392460823059, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19173279404640198, "step": 10061 }, { "epoch": 0.15024750072794332, "grad_norm": 0.287109375, "grad_norm_var": 0.002593739827473958, "learning_rate": 0.0001, "loss": 1.3895, "loss/crossentropy": 2.572270154953003, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1902458295226097, "step": 10062 }, { "epoch": 0.1502624328985583, "grad_norm": 0.31640625, "grad_norm_var": 0.0007179101308186849, "learning_rate": 0.0001, "loss": 1.4473, "loss/crossentropy": 2.770729899406433, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20903309434652328, "step": 10063 }, { "epoch": 0.15027736506917327, "grad_norm": 0.298828125, "grad_norm_var": 0.0007488091786702473, "learning_rate": 0.0001, "loss": 1.4165, "loss/crossentropy": 2.6818877458572388, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19774997234344482, "step": 10064 }, { "epoch": 0.15029229723978826, "grad_norm": 0.30078125, "grad_norm_var": 0.0007557551066080729, "learning_rate": 0.0001, "loss": 1.3165, "loss/crossentropy": 2.7464256286621094, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.1758958324790001, "step": 10065 }, { "epoch": 0.15030722941040325, "grad_norm": 0.296875, "grad_norm_var": 0.0007785638173421224, "learning_rate": 0.0001, "loss": 1.4455, "loss/crossentropy": 2.584527850151062, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20327311009168625, "step": 10066 }, { "epoch": 0.15032216158101822, "grad_norm": 0.375, "grad_norm_var": 0.0009658177693684896, "learning_rate": 0.0001, "loss": 1.528, "loss/crossentropy": 2.831634283065796, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.22331985086202621, "step": 10067 }, { "epoch": 0.1503370937516332, "grad_norm": 0.33203125, "grad_norm_var": 0.0009124120076497396, "learning_rate": 0.0001, "loss": 1.4139, "loss/crossentropy": 2.6207737922668457, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.1833956390619278, "step": 10068 }, { "epoch": 0.1503520259222482, "grad_norm": 0.5234375, "grad_norm_var": 0.003210051854451497, "learning_rate": 0.0001, "loss": 1.7952, "loss/crossentropy": 2.3720935583114624, "loss/fcd": 1.51171875, "loss/idx": 11.0, "loss/logits": 0.28348998725414276, "step": 10069 }, { "epoch": 0.15036695809286316, "grad_norm": 0.318359375, "grad_norm_var": 0.0032134373982747396, "learning_rate": 0.0001, "loss": 1.4965, "loss/crossentropy": 2.435214042663574, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2074592486023903, "step": 10070 }, { "epoch": 0.15038189026347815, "grad_norm": 0.333984375, "grad_norm_var": 0.003151384989420573, "learning_rate": 0.0001, "loss": 1.316, "loss/crossentropy": 2.5637052059173584, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.16369758546352386, "step": 10071 }, { "epoch": 0.15039682243409314, "grad_norm": 0.318359375, "grad_norm_var": 0.003092304865519206, "learning_rate": 0.0001, "loss": 1.5317, "loss/crossentropy": 2.5927761793136597, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.230958454310894, "step": 10072 }, { "epoch": 0.15041175460470813, "grad_norm": 0.390625, "grad_norm_var": 0.003242937723795573, "learning_rate": 0.0001, "loss": 1.6684, "loss/crossentropy": 2.3683507442474365, "loss/fcd": 1.41796875, "loss/idx": 11.0, "loss/logits": 0.2504047378897667, "step": 10073 }, { "epoch": 0.1504266867753231, "grad_norm": 0.318359375, "grad_norm_var": 0.003271929423014323, "learning_rate": 0.0001, "loss": 1.4034, "loss/crossentropy": 2.6012682914733887, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1924227476119995, "step": 10074 }, { "epoch": 0.15044161894593808, "grad_norm": 0.291015625, "grad_norm_var": 0.0033867994944254557, "learning_rate": 0.0001, "loss": 1.3787, "loss/crossentropy": 2.693960428237915, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18733906745910645, "step": 10075 }, { "epoch": 0.15045655111655307, "grad_norm": 0.30078125, "grad_norm_var": 0.0034380594889322916, "learning_rate": 0.0001, "loss": 1.3818, "loss/crossentropy": 2.6185017824172974, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18262727558612823, "step": 10076 }, { "epoch": 0.15047148328716803, "grad_norm": 0.392578125, "grad_norm_var": 0.003597450256347656, "learning_rate": 0.0001, "loss": 1.5676, "loss/crossentropy": 2.531493902206421, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.23949456214904785, "step": 10077 }, { "epoch": 0.15048641545778302, "grad_norm": 0.328125, "grad_norm_var": 0.0034288883209228514, "learning_rate": 0.0001, "loss": 1.3831, "loss/crossentropy": 2.74444580078125, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18774990737438202, "step": 10078 }, { "epoch": 0.150501347628398, "grad_norm": 0.283203125, "grad_norm_var": 0.0036010106404622396, "learning_rate": 0.0001, "loss": 1.2595, "loss/crossentropy": 2.748197913169861, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.157940074801445, "step": 10079 }, { "epoch": 0.15051627979901297, "grad_norm": 0.291015625, "grad_norm_var": 0.0036452611287434897, "learning_rate": 0.0001, "loss": 1.4354, "loss/crossentropy": 2.557100296020508, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20495715737342834, "step": 10080 }, { "epoch": 0.15053121196962796, "grad_norm": 0.302734375, "grad_norm_var": 0.003636026382446289, "learning_rate": 0.0001, "loss": 1.3495, "loss/crossentropy": 2.5797702074050903, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18538910895586014, "step": 10081 }, { "epoch": 0.15054614414024295, "grad_norm": 0.31640625, "grad_norm_var": 0.0035546461741129557, "learning_rate": 0.0001, "loss": 1.4693, "loss/crossentropy": 2.4957211017608643, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2232557088136673, "step": 10082 }, { "epoch": 0.15056107631085794, "grad_norm": 0.462890625, "grad_norm_var": 0.004465166727701823, "learning_rate": 0.0001, "loss": 1.6199, "loss/crossentropy": 2.660282611846924, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.26446981728076935, "step": 10083 }, { "epoch": 0.1505760084814729, "grad_norm": 0.28125, "grad_norm_var": 0.00470733642578125, "learning_rate": 0.0001, "loss": 1.4796, "loss/crossentropy": 2.50807523727417, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.22182802110910416, "step": 10084 }, { "epoch": 0.1505909406520879, "grad_norm": 0.341796875, "grad_norm_var": 0.002346658706665039, "learning_rate": 0.0001, "loss": 1.5341, "loss/crossentropy": 2.5385154485702515, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.23722637444734573, "step": 10085 }, { "epoch": 0.15060587282270288, "grad_norm": 0.296875, "grad_norm_var": 0.002407328287760417, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.6778329610824585, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20087464898824692, "step": 10086 }, { "epoch": 0.15062080499331784, "grad_norm": 0.279296875, "grad_norm_var": 0.0025515238444010417, "learning_rate": 0.0001, "loss": 1.5028, "loss/crossentropy": 2.315244436264038, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.23328328132629395, "step": 10087 }, { "epoch": 0.15063573716393283, "grad_norm": 0.296875, "grad_norm_var": 0.00259855588277181, "learning_rate": 0.0001, "loss": 1.4128, "loss/crossentropy": 2.591732382774353, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20186376571655273, "step": 10088 }, { "epoch": 0.15065066933454782, "grad_norm": 0.322265625, "grad_norm_var": 0.002277565002441406, "learning_rate": 0.0001, "loss": 1.4028, "loss/crossentropy": 2.6878243684768677, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19575048238039017, "step": 10089 }, { "epoch": 0.1506656015051628, "grad_norm": 0.28515625, "grad_norm_var": 0.002349710464477539, "learning_rate": 0.0001, "loss": 1.4722, "loss/crossentropy": 2.5283809900283813, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.23004943132400513, "step": 10090 }, { "epoch": 0.15068053367577777, "grad_norm": 0.296875, "grad_norm_var": 0.00233154296875, "learning_rate": 0.0001, "loss": 1.3829, "loss/crossentropy": 2.6595752239227295, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19539885222911835, "step": 10091 }, { "epoch": 0.15069546584639276, "grad_norm": 0.3203125, "grad_norm_var": 0.002312151590983073, "learning_rate": 0.0001, "loss": 1.3755, "loss/crossentropy": 2.654887318611145, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1723458096385002, "step": 10092 }, { "epoch": 0.15071039801700775, "grad_norm": 0.326171875, "grad_norm_var": 0.0019327799479166667, "learning_rate": 0.0001, "loss": 1.3842, "loss/crossentropy": 2.354169249534607, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18112491816282272, "step": 10093 }, { "epoch": 0.1507253301876227, "grad_norm": 0.28125, "grad_norm_var": 0.0019846598307291666, "learning_rate": 0.0001, "loss": 1.3554, "loss/crossentropy": 2.496657371520996, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17566601932048798, "step": 10094 }, { "epoch": 0.1507402623582377, "grad_norm": 0.306640625, "grad_norm_var": 0.0019304911295572917, "learning_rate": 0.0001, "loss": 1.3914, "loss/crossentropy": 2.7560606002807617, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.20390046387910843, "step": 10095 }, { "epoch": 0.1507551945288527, "grad_norm": 0.333984375, "grad_norm_var": 0.001920000712076823, "learning_rate": 0.0001, "loss": 1.468, "loss/crossentropy": 2.635585904121399, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.19069591164588928, "step": 10096 }, { "epoch": 0.15077012669946765, "grad_norm": 0.306640625, "grad_norm_var": 0.001914215087890625, "learning_rate": 0.0001, "loss": 1.4052, "loss/crossentropy": 2.7382696866989136, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19427112489938736, "step": 10097 }, { "epoch": 0.15078505887008264, "grad_norm": 0.341796875, "grad_norm_var": 0.00195616086324056, "learning_rate": 0.0001, "loss": 1.3545, "loss/crossentropy": 2.874353051185608, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.19046127796173096, "step": 10098 }, { "epoch": 0.15079999104069763, "grad_norm": 0.34375, "grad_norm_var": 0.0005338033040364583, "learning_rate": 0.0001, "loss": 1.6232, "loss/crossentropy": 2.359166741371155, "loss/fcd": 1.375, "loss/idx": 11.0, "loss/logits": 0.24818424880504608, "step": 10099 }, { "epoch": 0.15081492321131262, "grad_norm": 0.388671875, "grad_norm_var": 0.0008423964182535808, "learning_rate": 0.0001, "loss": 1.6181, "loss/crossentropy": 2.764457106590271, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.25088562816381454, "step": 10100 }, { "epoch": 0.15082985538192759, "grad_norm": 0.349609375, "grad_norm_var": 0.0008722782135009765, "learning_rate": 0.0001, "loss": 1.5578, "loss/crossentropy": 2.585451602935791, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.21795449405908585, "step": 10101 }, { "epoch": 0.15084478755254258, "grad_norm": 0.33984375, "grad_norm_var": 0.0008708794911702473, "learning_rate": 0.0001, "loss": 1.3791, "loss/crossentropy": 2.704614996910095, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18375515937805176, "step": 10102 }, { "epoch": 0.15085971972315756, "grad_norm": 0.375, "grad_norm_var": 0.0009246190388997396, "learning_rate": 0.0001, "loss": 1.4982, "loss/crossentropy": 2.6898720264434814, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.20912319421768188, "step": 10103 }, { "epoch": 0.15087465189377253, "grad_norm": 0.31640625, "grad_norm_var": 0.000872802734375, "learning_rate": 0.0001, "loss": 1.4156, "loss/crossentropy": 2.7294154167175293, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20856616646051407, "step": 10104 }, { "epoch": 0.15088958406438752, "grad_norm": 0.2890625, "grad_norm_var": 0.0009633223215738932, "learning_rate": 0.0001, "loss": 1.4645, "loss/crossentropy": 2.6954846382141113, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21448996663093567, "step": 10105 }, { "epoch": 0.1509045162350025, "grad_norm": 0.353515625, "grad_norm_var": 0.0008915583292643229, "learning_rate": 0.0001, "loss": 1.5469, "loss/crossentropy": 2.5146595239639282, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.21101189404726028, "step": 10106 }, { "epoch": 0.1509194484056175, "grad_norm": 0.275390625, "grad_norm_var": 0.0010134220123291016, "learning_rate": 0.0001, "loss": 1.2918, "loss/crossentropy": 2.5666396617889404, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.1746542900800705, "step": 10107 }, { "epoch": 0.15093438057623246, "grad_norm": 0.32421875, "grad_norm_var": 0.0010103702545166016, "learning_rate": 0.0001, "loss": 1.6383, "loss/crossentropy": 2.6520591974258423, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.2515737935900688, "step": 10108 }, { "epoch": 0.15094931274684745, "grad_norm": 0.306640625, "grad_norm_var": 0.0010396162668863931, "learning_rate": 0.0001, "loss": 1.3745, "loss/crossentropy": 2.5561821460723877, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18700099736452103, "step": 10109 }, { "epoch": 0.15096424491746244, "grad_norm": 0.28515625, "grad_norm_var": 0.0010167280832926431, "learning_rate": 0.0001, "loss": 1.4728, "loss/crossentropy": 2.6219704151153564, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21497580409049988, "step": 10110 }, { "epoch": 0.1509791770880774, "grad_norm": 0.30859375, "grad_norm_var": 0.0010115941365559896, "learning_rate": 0.0001, "loss": 1.5757, "loss/crossentropy": 2.904939651489258, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.2515237182378769, "step": 10111 }, { "epoch": 0.1509941092586924, "grad_norm": 0.37109375, "grad_norm_var": 0.0011302789052327475, "learning_rate": 0.0001, "loss": 1.513, "loss/crossentropy": 2.4634041786193848, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2278931438922882, "step": 10112 }, { "epoch": 0.15100904142930738, "grad_norm": 0.30859375, "grad_norm_var": 0.0011245091756184897, "learning_rate": 0.0001, "loss": 1.3937, "loss/crossentropy": 2.3816951513290405, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.2022719755768776, "step": 10113 }, { "epoch": 0.15102397359992234, "grad_norm": 0.369140625, "grad_norm_var": 0.0012148539225260416, "learning_rate": 0.0001, "loss": 1.5017, "loss/crossentropy": 2.5926624536514282, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21263794600963593, "step": 10114 }, { "epoch": 0.15103890577053733, "grad_norm": 0.294921875, "grad_norm_var": 0.0012843926747639974, "learning_rate": 0.0001, "loss": 1.407, "loss/crossentropy": 2.503525733947754, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19215254485607147, "step": 10115 }, { "epoch": 0.15105383794115232, "grad_norm": 0.296875, "grad_norm_var": 0.0010744730631510417, "learning_rate": 0.0001, "loss": 1.4461, "loss/crossentropy": 2.616706132888794, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20777733623981476, "step": 10116 }, { "epoch": 0.1510687701117673, "grad_norm": 0.31640625, "grad_norm_var": 0.001024484634399414, "learning_rate": 0.0001, "loss": 1.4464, "loss/crossentropy": 2.6581753492355347, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2003139778971672, "step": 10117 }, { "epoch": 0.15108370228238227, "grad_norm": 0.287109375, "grad_norm_var": 0.00106353759765625, "learning_rate": 0.0001, "loss": 1.3397, "loss/crossentropy": 2.6165027618408203, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18341590464115143, "step": 10118 }, { "epoch": 0.15109863445299726, "grad_norm": 0.2890625, "grad_norm_var": 0.0008649190266927083, "learning_rate": 0.0001, "loss": 1.4398, "loss/crossentropy": 2.6390976905822754, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2053992822766304, "step": 10119 }, { "epoch": 0.15111356662361225, "grad_norm": 0.298828125, "grad_norm_var": 0.0008739312489827474, "learning_rate": 0.0001, "loss": 1.4054, "loss/crossentropy": 2.507568120956421, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18275219947099686, "step": 10120 }, { "epoch": 0.1511284987942272, "grad_norm": 0.373046875, "grad_norm_var": 0.0010700861612955729, "learning_rate": 0.0001, "loss": 1.7878, "loss/crossentropy": 2.2884987592697144, "loss/fcd": 1.53515625, "loss/idx": 11.0, "loss/logits": 0.2526855170726776, "step": 10121 }, { "epoch": 0.1511434309648422, "grad_norm": 0.271484375, "grad_norm_var": 0.0010821024576822917, "learning_rate": 0.0001, "loss": 1.4149, "loss/crossentropy": 2.5567623376846313, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19614489376544952, "step": 10122 }, { "epoch": 0.1511583631354572, "grad_norm": 0.39453125, "grad_norm_var": 0.001403029759724935, "learning_rate": 0.0001, "loss": 1.4206, "loss/crossentropy": 2.3014129996299744, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.18229667842388153, "step": 10123 }, { "epoch": 0.15117329530607218, "grad_norm": 0.314453125, "grad_norm_var": 0.001401519775390625, "learning_rate": 0.0001, "loss": 1.4172, "loss/crossentropy": 2.4482648372650146, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19842958450317383, "step": 10124 }, { "epoch": 0.15118822747668714, "grad_norm": 0.326171875, "grad_norm_var": 0.0013961156209309896, "learning_rate": 0.0001, "loss": 1.3641, "loss/crossentropy": 2.7378376722335815, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18054942041635513, "step": 10125 }, { "epoch": 0.15120315964730213, "grad_norm": 0.33203125, "grad_norm_var": 0.0013213475545247396, "learning_rate": 0.0001, "loss": 1.4423, "loss/crossentropy": 2.4271591901779175, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.19616008549928665, "step": 10126 }, { "epoch": 0.15121809181791712, "grad_norm": 0.38671875, "grad_norm_var": 0.0015629450480143228, "learning_rate": 0.0001, "loss": 1.5662, "loss/crossentropy": 2.6775461435317993, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.23418185114860535, "step": 10127 }, { "epoch": 0.15123302398853208, "grad_norm": 0.341796875, "grad_norm_var": 0.0014439741770426431, "learning_rate": 0.0001, "loss": 1.5317, "loss/crossentropy": 2.345434308052063, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.23479195684194565, "step": 10128 }, { "epoch": 0.15124795615914707, "grad_norm": 0.373046875, "grad_norm_var": 0.0015619913736979166, "learning_rate": 0.0001, "loss": 1.4074, "loss/crossentropy": 2.54134464263916, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19252432137727737, "step": 10129 }, { "epoch": 0.15126288832976206, "grad_norm": 0.302734375, "grad_norm_var": 0.0014830907185872397, "learning_rate": 0.0001, "loss": 1.3496, "loss/crossentropy": 2.786808729171753, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18553873896598816, "step": 10130 }, { "epoch": 0.15127782050037702, "grad_norm": 0.318359375, "grad_norm_var": 0.0014235814412434896, "learning_rate": 0.0001, "loss": 1.3516, "loss/crossentropy": 2.469981074333191, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1719360128045082, "step": 10131 }, { "epoch": 0.15129275267099201, "grad_norm": 0.294921875, "grad_norm_var": 0.0014315128326416015, "learning_rate": 0.0001, "loss": 1.4256, "loss/crossentropy": 2.475711226463318, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19517408311367035, "step": 10132 }, { "epoch": 0.151307684841607, "grad_norm": 0.3203125, "grad_norm_var": 0.0014273166656494141, "learning_rate": 0.0001, "loss": 1.5023, "loss/crossentropy": 2.603395104408264, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.22495955973863602, "step": 10133 }, { "epoch": 0.151322617012222, "grad_norm": 0.333984375, "grad_norm_var": 0.001318216323852539, "learning_rate": 0.0001, "loss": 1.5234, "loss/crossentropy": 2.5853739976882935, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2108905240893364, "step": 10134 }, { "epoch": 0.15133754918283696, "grad_norm": 0.27734375, "grad_norm_var": 0.001389932632446289, "learning_rate": 0.0001, "loss": 1.4045, "loss/crossentropy": 2.7013758420944214, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1935456171631813, "step": 10135 }, { "epoch": 0.15135248135345195, "grad_norm": 0.298828125, "grad_norm_var": 0.001389932632446289, "learning_rate": 0.0001, "loss": 1.4502, "loss/crossentropy": 2.4788116216659546, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21583905071020126, "step": 10136 }, { "epoch": 0.15136741352406693, "grad_norm": 0.298828125, "grad_norm_var": 0.0012957096099853516, "learning_rate": 0.0001, "loss": 1.315, "loss/crossentropy": 2.7359412908554077, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17436382174491882, "step": 10137 }, { "epoch": 0.1513823456946819, "grad_norm": 0.3359375, "grad_norm_var": 0.00110321044921875, "learning_rate": 0.0001, "loss": 1.4799, "loss/crossentropy": 2.7997851371765137, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.21032756567001343, "step": 10138 }, { "epoch": 0.1513972778652969, "grad_norm": 0.314453125, "grad_norm_var": 0.0007949670155843099, "learning_rate": 0.0001, "loss": 1.5209, "loss/crossentropy": 2.789363145828247, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.23576918244361877, "step": 10139 }, { "epoch": 0.15141221003591188, "grad_norm": 0.287109375, "grad_norm_var": 0.0008732954661051433, "learning_rate": 0.0001, "loss": 1.4162, "loss/crossentropy": 2.6813353300094604, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20140212029218674, "step": 10140 }, { "epoch": 0.15142714220652684, "grad_norm": 0.310546875, "grad_norm_var": 0.0008786360422770183, "learning_rate": 0.0001, "loss": 1.4408, "loss/crossentropy": 2.6887229681015015, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.21424736082553864, "step": 10141 }, { "epoch": 0.15144207437714183, "grad_norm": 0.31640625, "grad_norm_var": 0.0008697350819905599, "learning_rate": 0.0001, "loss": 1.2816, "loss/crossentropy": 2.5982900857925415, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.1644216701388359, "step": 10142 }, { "epoch": 0.15145700654775682, "grad_norm": 0.318359375, "grad_norm_var": 0.0005487442016601562, "learning_rate": 0.0001, "loss": 1.4238, "loss/crossentropy": 2.489863634109497, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20110298693180084, "step": 10143 }, { "epoch": 0.1514719387183718, "grad_norm": 0.330078125, "grad_norm_var": 0.0005157470703125, "learning_rate": 0.0001, "loss": 1.4695, "loss/crossentropy": 2.3909460306167603, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.21555277705192566, "step": 10144 }, { "epoch": 0.15148687088898677, "grad_norm": 0.28515625, "grad_norm_var": 0.0003118991851806641, "learning_rate": 0.0001, "loss": 1.399, "loss/crossentropy": 2.618376851081848, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19975481927394867, "step": 10145 }, { "epoch": 0.15150180305960176, "grad_norm": 0.33984375, "grad_norm_var": 0.00036716461181640625, "learning_rate": 0.0001, "loss": 1.4485, "loss/crossentropy": 2.77266001701355, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21408532559871674, "step": 10146 }, { "epoch": 0.15151673523021675, "grad_norm": 0.34765625, "grad_norm_var": 0.00044846534729003906, "learning_rate": 0.0001, "loss": 1.479, "loss/crossentropy": 2.3739945888519287, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.209456667304039, "step": 10147 }, { "epoch": 0.1515316674008317, "grad_norm": 0.375, "grad_norm_var": 0.0006550470987955729, "learning_rate": 0.0001, "loss": 1.5537, "loss/crossentropy": 2.4718782901763916, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.2294815629720688, "step": 10148 }, { "epoch": 0.1515465995714467, "grad_norm": 0.330078125, "grad_norm_var": 0.0006638685862223307, "learning_rate": 0.0001, "loss": 1.3983, "loss/crossentropy": 2.3384612798690796, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18736077845096588, "step": 10149 }, { "epoch": 0.1515615317420617, "grad_norm": 0.3203125, "grad_norm_var": 0.000647735595703125, "learning_rate": 0.0001, "loss": 1.3433, "loss/crossentropy": 2.768918037414551, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.19090763479471207, "step": 10150 }, { "epoch": 0.15157646391267668, "grad_norm": 0.76171875, "grad_norm_var": 0.012694040934244791, "learning_rate": 0.0001, "loss": 1.4964, "loss/crossentropy": 2.863254189491272, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.1760484129190445, "step": 10151 }, { "epoch": 0.15159139608329164, "grad_norm": 0.271484375, "grad_norm_var": 0.012920570373535157, "learning_rate": 0.0001, "loss": 1.3159, "loss/crossentropy": 2.6653918027877808, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1713675633072853, "step": 10152 }, { "epoch": 0.15160632825390663, "grad_norm": 0.3125, "grad_norm_var": 0.012845468521118165, "learning_rate": 0.0001, "loss": 1.4218, "loss/crossentropy": 2.7080471515655518, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19135241210460663, "step": 10153 }, { "epoch": 0.15162126042452162, "grad_norm": 0.482421875, "grad_norm_var": 0.01396484375, "learning_rate": 0.0001, "loss": 1.6653, "loss/crossentropy": 2.3464457988739014, "loss/fcd": 1.43359375, "loss/idx": 11.0, "loss/logits": 0.23172593116760254, "step": 10154 }, { "epoch": 0.15163619259513658, "grad_norm": 0.291015625, "grad_norm_var": 0.014130401611328124, "learning_rate": 0.0001, "loss": 1.3896, "loss/crossentropy": 2.5272918939590454, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19034866243600845, "step": 10155 }, { "epoch": 0.15165112476575157, "grad_norm": 0.32421875, "grad_norm_var": 0.013880650202433268, "learning_rate": 0.0001, "loss": 1.4528, "loss/crossentropy": 2.2895838022232056, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.19110164046287537, "step": 10156 }, { "epoch": 0.15166605693636656, "grad_norm": 0.40625, "grad_norm_var": 0.01385650634765625, "learning_rate": 0.0001, "loss": 1.4021, "loss/crossentropy": 2.6751275062561035, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.18723811209201813, "step": 10157 }, { "epoch": 0.15168098910698152, "grad_norm": 0.31640625, "grad_norm_var": 0.01385650634765625, "learning_rate": 0.0001, "loss": 1.3868, "loss/crossentropy": 2.741644263267517, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19151423871517181, "step": 10158 }, { "epoch": 0.1516959212775965, "grad_norm": 0.29296875, "grad_norm_var": 0.014048878351847332, "learning_rate": 0.0001, "loss": 1.3859, "loss/crossentropy": 2.5264852046966553, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19056155532598495, "step": 10159 }, { "epoch": 0.1517108534482115, "grad_norm": 0.318359375, "grad_norm_var": 0.01410686175028483, "learning_rate": 0.0001, "loss": 1.4083, "loss/crossentropy": 2.4746217727661133, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.197329543530941, "step": 10160 }, { "epoch": 0.1517257856188265, "grad_norm": 0.30859375, "grad_norm_var": 0.013904301325480144, "learning_rate": 0.0001, "loss": 1.4526, "loss/crossentropy": 2.7115973234176636, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2025970220565796, "step": 10161 }, { "epoch": 0.15174071778944145, "grad_norm": 0.326171875, "grad_norm_var": 0.01395715077718099, "learning_rate": 0.0001, "loss": 1.4076, "loss/crossentropy": 2.6040624380111694, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20059548318386078, "step": 10162 }, { "epoch": 0.15175564996005644, "grad_norm": 0.466796875, "grad_norm_var": 0.014623244603474935, "learning_rate": 0.0001, "loss": 1.6565, "loss/crossentropy": 2.4780611991882324, "loss/fcd": 1.390625, "loss/idx": 11.0, "loss/logits": 0.26591939479112625, "step": 10163 }, { "epoch": 0.15177058213067143, "grad_norm": 0.333984375, "grad_norm_var": 0.014695676167805989, "learning_rate": 0.0001, "loss": 1.4357, "loss/crossentropy": 2.6949338912963867, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20526409149169922, "step": 10164 }, { "epoch": 0.1517855143012864, "grad_norm": 0.296875, "grad_norm_var": 0.014925622940063476, "learning_rate": 0.0001, "loss": 1.3978, "loss/crossentropy": 2.3580973148345947, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19467291980981827, "step": 10165 }, { "epoch": 0.15180044647190138, "grad_norm": 0.330078125, "grad_norm_var": 0.01487420399983724, "learning_rate": 0.0001, "loss": 1.3501, "loss/crossentropy": 2.5945141315460205, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17817923426628113, "step": 10166 }, { "epoch": 0.15181537864251637, "grad_norm": 0.390625, "grad_norm_var": 0.003851318359375, "learning_rate": 0.0001, "loss": 1.5689, "loss/crossentropy": 2.830217719078064, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.24465502798557281, "step": 10167 }, { "epoch": 0.15183031081313136, "grad_norm": 0.423828125, "grad_norm_var": 0.0038736343383789064, "learning_rate": 0.0001, "loss": 1.415, "loss/crossentropy": 2.7357760667800903, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.18838845193386078, "step": 10168 }, { "epoch": 0.15184524298374633, "grad_norm": 0.361328125, "grad_norm_var": 0.0037699222564697267, "learning_rate": 0.0001, "loss": 1.5844, "loss/crossentropy": 2.5537887811660767, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.2328043431043625, "step": 10169 }, { "epoch": 0.15186017515436132, "grad_norm": 0.34765625, "grad_norm_var": 0.002604103088378906, "learning_rate": 0.0001, "loss": 1.506, "loss/crossentropy": 2.742166757583618, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.22084493190050125, "step": 10170 }, { "epoch": 0.1518751073249763, "grad_norm": 0.2890625, "grad_norm_var": 0.0026186466217041015, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.523624897003174, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.223426915705204, "step": 10171 }, { "epoch": 0.15189003949559127, "grad_norm": 0.35546875, "grad_norm_var": 0.0025896549224853514, "learning_rate": 0.0001, "loss": 1.4572, "loss/crossentropy": 2.484149932861328, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.19158197194337845, "step": 10172 }, { "epoch": 0.15190497166620626, "grad_norm": 0.330078125, "grad_norm_var": 0.0023584365844726562, "learning_rate": 0.0001, "loss": 1.4477, "loss/crossentropy": 2.696300983428955, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20941807329654694, "step": 10173 }, { "epoch": 0.15191990383682125, "grad_norm": 0.328125, "grad_norm_var": 0.002325439453125, "learning_rate": 0.0001, "loss": 1.4876, "loss/crossentropy": 2.5365147590637207, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21412861347198486, "step": 10174 }, { "epoch": 0.1519348360074362, "grad_norm": 0.337890625, "grad_norm_var": 0.0021474043528238933, "learning_rate": 0.0001, "loss": 1.4309, "loss/crossentropy": 2.6324676275253296, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.22386977076530457, "step": 10175 }, { "epoch": 0.1519497681780512, "grad_norm": 0.369140625, "grad_norm_var": 0.0021176497141520184, "learning_rate": 0.0001, "loss": 1.5668, "loss/crossentropy": 2.632393717765808, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.23474617302417755, "step": 10176 }, { "epoch": 0.1519647003486662, "grad_norm": 0.357421875, "grad_norm_var": 0.0019988377888997396, "learning_rate": 0.0001, "loss": 1.4312, "loss/crossentropy": 2.561187744140625, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19292673468589783, "step": 10177 }, { "epoch": 0.15197963251928118, "grad_norm": 0.345703125, "grad_norm_var": 0.001953379313151042, "learning_rate": 0.0001, "loss": 1.4919, "loss/crossentropy": 2.47946298122406, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.20676856487989426, "step": 10178 }, { "epoch": 0.15199456468989614, "grad_norm": 0.345703125, "grad_norm_var": 0.001048723856608073, "learning_rate": 0.0001, "loss": 1.5071, "loss/crossentropy": 2.823045492172241, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.22979740798473358, "step": 10179 }, { "epoch": 0.15200949686051113, "grad_norm": 0.51953125, "grad_norm_var": 0.0028924147288004556, "learning_rate": 0.0001, "loss": 1.6565, "loss/crossentropy": 2.4391560554504395, "loss/fcd": 1.421875, "loss/idx": 11.0, "loss/logits": 0.23463425040245056, "step": 10180 }, { "epoch": 0.15202442903112612, "grad_norm": 0.6015625, "grad_norm_var": 0.006210056940714518, "learning_rate": 0.0001, "loss": 1.8108, "loss/crossentropy": 2.7371315956115723, "loss/fcd": 1.50390625, "loss/idx": 11.0, "loss/logits": 0.3068864494562149, "step": 10181 }, { "epoch": 0.15203936120174108, "grad_norm": 0.357421875, "grad_norm_var": 0.006085443496704102, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.609680414199829, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18920360505580902, "step": 10182 }, { "epoch": 0.15205429337235607, "grad_norm": 0.36328125, "grad_norm_var": 0.006089003880818685, "learning_rate": 0.0001, "loss": 1.6838, "loss/crossentropy": 2.534794807434082, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.27365975081920624, "step": 10183 }, { "epoch": 0.15206922554297106, "grad_norm": 0.3046875, "grad_norm_var": 0.0062334696451822914, "learning_rate": 0.0001, "loss": 1.3335, "loss/crossentropy": 2.4610235691070557, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17329761385917664, "step": 10184 }, { "epoch": 0.15208415771358605, "grad_norm": 0.3359375, "grad_norm_var": 0.00630186398824056, "learning_rate": 0.0001, "loss": 1.3083, "loss/crossentropy": 2.7598202228546143, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.16772320866584778, "step": 10185 }, { "epoch": 0.152099089884201, "grad_norm": 0.37890625, "grad_norm_var": 0.0062779585520426435, "learning_rate": 0.0001, "loss": 1.4088, "loss/crossentropy": 2.4069007635116577, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19396332651376724, "step": 10186 }, { "epoch": 0.152114022054816, "grad_norm": 0.3671875, "grad_norm_var": 0.0058163801829020185, "learning_rate": 0.0001, "loss": 1.3647, "loss/crossentropy": 2.593541383743286, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1732480451464653, "step": 10187 }, { "epoch": 0.152128954225431, "grad_norm": 0.7265625, "grad_norm_var": 0.01346294085184733, "learning_rate": 0.0001, "loss": 1.5099, "loss/crossentropy": 2.5462650060653687, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.22081434726715088, "step": 10188 }, { "epoch": 0.15214388639604595, "grad_norm": 0.275390625, "grad_norm_var": 0.014145644505818684, "learning_rate": 0.0001, "loss": 1.3262, "loss/crossentropy": 2.5909968614578247, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17380842566490173, "step": 10189 }, { "epoch": 0.15215881856666094, "grad_norm": 0.345703125, "grad_norm_var": 0.014009030659993489, "learning_rate": 0.0001, "loss": 1.5931, "loss/crossentropy": 2.6395069360733032, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.24541612714529037, "step": 10190 }, { "epoch": 0.15217375073727593, "grad_norm": 0.33984375, "grad_norm_var": 0.013994201024373373, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.782302141189575, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20583823323249817, "step": 10191 }, { "epoch": 0.1521886829078909, "grad_norm": 0.279296875, "grad_norm_var": 0.014818938573201497, "learning_rate": 0.0001, "loss": 1.3553, "loss/crossentropy": 2.677651047706604, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.19901365041732788, "step": 10192 }, { "epoch": 0.15220361507850588, "grad_norm": 0.365234375, "grad_norm_var": 0.014788548151652018, "learning_rate": 0.0001, "loss": 1.6177, "loss/crossentropy": 2.6455873250961304, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.24656907469034195, "step": 10193 }, { "epoch": 0.15221854724912087, "grad_norm": 0.30859375, "grad_norm_var": 0.015097490946451823, "learning_rate": 0.0001, "loss": 1.4115, "loss/crossentropy": 2.3961336612701416, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18887241184711456, "step": 10194 }, { "epoch": 0.15223347941973586, "grad_norm": 0.39453125, "grad_norm_var": 0.014968347549438477, "learning_rate": 0.0001, "loss": 1.5593, "loss/crossentropy": 2.366592049598694, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.21941277384757996, "step": 10195 }, { "epoch": 0.15224841159035082, "grad_norm": 0.30859375, "grad_norm_var": 0.014147806167602538, "learning_rate": 0.0001, "loss": 1.4179, "loss/crossentropy": 2.3913997411727905, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19915670156478882, "step": 10196 }, { "epoch": 0.1522633437609658, "grad_norm": 0.328125, "grad_norm_var": 0.010680882136027019, "learning_rate": 0.0001, "loss": 1.3875, "loss/crossentropy": 2.756798028945923, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1882973164319992, "step": 10197 }, { "epoch": 0.1522782759315808, "grad_norm": 0.361328125, "grad_norm_var": 0.010679864883422851, "learning_rate": 0.0001, "loss": 1.4609, "loss/crossentropy": 2.733194589614868, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21086321026086807, "step": 10198 }, { "epoch": 0.15229320810219577, "grad_norm": 0.35546875, "grad_norm_var": 0.010681772232055664, "learning_rate": 0.0001, "loss": 1.6311, "loss/crossentropy": 2.3073121309280396, "loss/fcd": 1.390625, "loss/idx": 11.0, "loss/logits": 0.240485779941082, "step": 10199 }, { "epoch": 0.15230814027281075, "grad_norm": 0.349609375, "grad_norm_var": 0.010470835367838542, "learning_rate": 0.0001, "loss": 1.4971, "loss/crossentropy": 2.492119312286377, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.23151571303606033, "step": 10200 }, { "epoch": 0.15232307244342574, "grad_norm": 0.310546875, "grad_norm_var": 0.010605351130167643, "learning_rate": 0.0001, "loss": 1.4408, "loss/crossentropy": 2.6830320358276367, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.2025175839662552, "step": 10201 }, { "epoch": 0.1523380046140407, "grad_norm": 0.29296875, "grad_norm_var": 0.010875304539998373, "learning_rate": 0.0001, "loss": 1.3301, "loss/crossentropy": 2.7503429651260376, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1699291169643402, "step": 10202 }, { "epoch": 0.1523529367846557, "grad_norm": 0.291015625, "grad_norm_var": 0.011132558186848959, "learning_rate": 0.0001, "loss": 1.3497, "loss/crossentropy": 2.7344523668289185, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18952197581529617, "step": 10203 }, { "epoch": 0.15236786895527069, "grad_norm": 0.318359375, "grad_norm_var": 0.001163339614868164, "learning_rate": 0.0001, "loss": 1.492, "loss/crossentropy": 2.541029930114746, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21468326449394226, "step": 10204 }, { "epoch": 0.15238280112588568, "grad_norm": 0.294921875, "grad_norm_var": 0.0010539849599202475, "learning_rate": 0.0001, "loss": 1.4312, "loss/crossentropy": 2.831676721572876, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2007324993610382, "step": 10205 }, { "epoch": 0.15239773329650064, "grad_norm": 0.310546875, "grad_norm_var": 0.0010471185048421224, "learning_rate": 0.0001, "loss": 1.4115, "loss/crossentropy": 2.531093120574951, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18887269496917725, "step": 10206 }, { "epoch": 0.15241266546711563, "grad_norm": 0.298828125, "grad_norm_var": 0.0010741551717122395, "learning_rate": 0.0001, "loss": 1.3659, "loss/crossentropy": 2.603582739830017, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1822567954659462, "step": 10207 }, { "epoch": 0.15242759763773062, "grad_norm": 0.287109375, "grad_norm_var": 0.0010324478149414062, "learning_rate": 0.0001, "loss": 1.4707, "loss/crossentropy": 2.3376026153564453, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2284959778189659, "step": 10208 }, { "epoch": 0.15244252980834558, "grad_norm": 0.384765625, "grad_norm_var": 0.001165008544921875, "learning_rate": 0.0001, "loss": 1.6305, "loss/crossentropy": 2.5697003602981567, "loss/fcd": 1.390625, "loss/idx": 11.0, "loss/logits": 0.23986150324344635, "step": 10209 }, { "epoch": 0.15245746197896057, "grad_norm": 0.30078125, "grad_norm_var": 0.00118560791015625, "learning_rate": 0.0001, "loss": 1.3945, "loss/crossentropy": 2.756333589553833, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19528796523809433, "step": 10210 }, { "epoch": 0.15247239414957556, "grad_norm": 0.33203125, "grad_norm_var": 0.00084381103515625, "learning_rate": 0.0001, "loss": 1.3877, "loss/crossentropy": 2.7017102241516113, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18456972390413284, "step": 10211 }, { "epoch": 0.15248732632019055, "grad_norm": 0.30859375, "grad_norm_var": 0.00084381103515625, "learning_rate": 0.0001, "loss": 1.4, "loss/crossentropy": 2.608279824256897, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18909549713134766, "step": 10212 }, { "epoch": 0.1525022584908055, "grad_norm": 0.296875, "grad_norm_var": 0.0008722941080729166, "learning_rate": 0.0001, "loss": 1.548, "loss/crossentropy": 2.401172637939453, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.23939698934555054, "step": 10213 }, { "epoch": 0.1525171906614205, "grad_norm": 0.296875, "grad_norm_var": 0.0007626692454020183, "learning_rate": 0.0001, "loss": 1.4125, "loss/crossentropy": 2.6010496616363525, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19379384815692902, "step": 10214 }, { "epoch": 0.1525321228320355, "grad_norm": 0.318359375, "grad_norm_var": 0.0006451924641927083, "learning_rate": 0.0001, "loss": 1.4717, "loss/crossentropy": 2.7060872316360474, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21393036842346191, "step": 10215 }, { "epoch": 0.15254705500265045, "grad_norm": 0.3359375, "grad_norm_var": 0.0005883375803629557, "learning_rate": 0.0001, "loss": 1.4296, "loss/crossentropy": 2.701101779937744, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.1952056586742401, "step": 10216 }, { "epoch": 0.15256198717326544, "grad_norm": 0.376953125, "grad_norm_var": 0.0008585453033447266, "learning_rate": 0.0001, "loss": 1.5107, "loss/crossentropy": 2.6877511739730835, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.20601077377796173, "step": 10217 }, { "epoch": 0.15257691934388043, "grad_norm": 0.380859375, "grad_norm_var": 0.001079559326171875, "learning_rate": 0.0001, "loss": 1.6198, "loss/crossentropy": 2.5569745302200317, "loss/fcd": 1.40625, "loss/idx": 11.0, "loss/logits": 0.2135186642408371, "step": 10218 }, { "epoch": 0.1525918515144954, "grad_norm": 0.3125, "grad_norm_var": 0.0010230859120686849, "learning_rate": 0.0001, "loss": 1.3761, "loss/crossentropy": 2.671009063720703, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1885816603899002, "step": 10219 }, { "epoch": 0.15260678368511038, "grad_norm": 0.318359375, "grad_norm_var": 0.0010230859120686849, "learning_rate": 0.0001, "loss": 1.4531, "loss/crossentropy": 2.643783211708069, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19529996067285538, "step": 10220 }, { "epoch": 0.15262171585572537, "grad_norm": 0.32421875, "grad_norm_var": 0.0009703954060872396, "learning_rate": 0.0001, "loss": 1.4126, "loss/crossentropy": 2.671128511428833, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19780059903860092, "step": 10221 }, { "epoch": 0.15263664802634036, "grad_norm": 0.306640625, "grad_norm_var": 0.0009783426920572916, "learning_rate": 0.0001, "loss": 1.3529, "loss/crossentropy": 2.455000877380371, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.16934746503829956, "step": 10222 }, { "epoch": 0.15265158019695532, "grad_norm": 0.291015625, "grad_norm_var": 0.0010080973307291666, "learning_rate": 0.0001, "loss": 1.3802, "loss/crossentropy": 2.539136528968811, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18883124738931656, "step": 10223 }, { "epoch": 0.1526665123675703, "grad_norm": 0.29296875, "grad_norm_var": 0.0009820143381754558, "learning_rate": 0.0001, "loss": 1.2937, "loss/crossentropy": 2.6367310285568237, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.16086933761835098, "step": 10224 }, { "epoch": 0.1526814445381853, "grad_norm": 0.30859375, "grad_norm_var": 0.0007235209147135416, "learning_rate": 0.0001, "loss": 1.3867, "loss/crossentropy": 2.562105178833008, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19135746359825134, "step": 10225 }, { "epoch": 0.15269637670880026, "grad_norm": 0.322265625, "grad_norm_var": 0.0007006168365478515, "learning_rate": 0.0001, "loss": 1.5074, "loss/crossentropy": 2.629238724708557, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.2417856901884079, "step": 10226 }, { "epoch": 0.15271130887941525, "grad_norm": 0.349609375, "grad_norm_var": 0.0007476806640625, "learning_rate": 0.0001, "loss": 1.5093, "loss/crossentropy": 2.38352632522583, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.22416967153549194, "step": 10227 }, { "epoch": 0.15272624105003024, "grad_norm": 0.30859375, "grad_norm_var": 0.0007476806640625, "learning_rate": 0.0001, "loss": 1.395, "loss/crossentropy": 2.7251423597335815, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18402718007564545, "step": 10228 }, { "epoch": 0.15274117322064523, "grad_norm": 0.396484375, "grad_norm_var": 0.001043558120727539, "learning_rate": 0.0001, "loss": 1.6457, "loss/crossentropy": 2.594710111618042, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.26285865157842636, "step": 10229 }, { "epoch": 0.1527561053912602, "grad_norm": 0.37890625, "grad_norm_var": 0.001129007339477539, "learning_rate": 0.0001, "loss": 1.4456, "loss/crossentropy": 2.9007047414779663, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20340211689472198, "step": 10230 }, { "epoch": 0.15277103756187518, "grad_norm": 0.32421875, "grad_norm_var": 0.0011199951171875, "learning_rate": 0.0001, "loss": 1.3783, "loss/crossentropy": 2.47553288936615, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18294049054384232, "step": 10231 }, { "epoch": 0.15278596973249017, "grad_norm": 0.306640625, "grad_norm_var": 0.0011621952056884766, "learning_rate": 0.0001, "loss": 1.3543, "loss/crossentropy": 2.66133451461792, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1824447065591812, "step": 10232 }, { "epoch": 0.15280090190310514, "grad_norm": 0.302734375, "grad_norm_var": 0.0010534763336181641, "learning_rate": 0.0001, "loss": 1.5268, "loss/crossentropy": 2.440102696418762, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.22211288660764694, "step": 10233 }, { "epoch": 0.15281583407372012, "grad_norm": 0.29296875, "grad_norm_var": 0.0008996963500976562, "learning_rate": 0.0001, "loss": 1.5197, "loss/crossentropy": 2.4966580867767334, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.2228325679898262, "step": 10234 }, { "epoch": 0.15283076624433511, "grad_norm": 0.2890625, "grad_norm_var": 0.0009607315063476562, "learning_rate": 0.0001, "loss": 1.3113, "loss/crossentropy": 2.3949592113494873, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.1706673502922058, "step": 10235 }, { "epoch": 0.15284569841495008, "grad_norm": 0.271484375, "grad_norm_var": 0.0011056900024414063, "learning_rate": 0.0001, "loss": 1.3318, "loss/crossentropy": 2.641090750694275, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.18337635695934296, "step": 10236 }, { "epoch": 0.15286063058556507, "grad_norm": 0.337890625, "grad_norm_var": 0.0011311690012613931, "learning_rate": 0.0001, "loss": 1.4813, "loss/crossentropy": 2.403713822364807, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.20000377297401428, "step": 10237 }, { "epoch": 0.15287556275618006, "grad_norm": 0.306640625, "grad_norm_var": 0.0011311690012613931, "learning_rate": 0.0001, "loss": 1.3941, "loss/crossentropy": 2.3982051610946655, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19873756170272827, "step": 10238 }, { "epoch": 0.15289049492679505, "grad_norm": 0.283203125, "grad_norm_var": 0.001162576675415039, "learning_rate": 0.0001, "loss": 1.3803, "loss/crossentropy": 2.4990077018737793, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1810654178261757, "step": 10239 }, { "epoch": 0.15290542709741, "grad_norm": 0.31640625, "grad_norm_var": 0.0011217594146728516, "learning_rate": 0.0001, "loss": 1.3472, "loss/crossentropy": 2.7601377964019775, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17533553391695023, "step": 10240 }, { "epoch": 0.152920359268025, "grad_norm": 0.298828125, "grad_norm_var": 0.001140594482421875, "learning_rate": 0.0001, "loss": 1.3682, "loss/crossentropy": 2.687603712081909, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18464168906211853, "step": 10241 }, { "epoch": 0.15293529143864, "grad_norm": 0.359375, "grad_norm_var": 0.0012484073638916015, "learning_rate": 0.0001, "loss": 1.6978, "loss/crossentropy": 2.523091197013855, "loss/fcd": 1.4453125, "loss/idx": 11.0, "loss/logits": 0.25249311327934265, "step": 10242 }, { "epoch": 0.15295022360925495, "grad_norm": 0.314453125, "grad_norm_var": 0.001187753677368164, "learning_rate": 0.0001, "loss": 1.4768, "loss/crossentropy": 2.4043627977371216, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.1994726061820984, "step": 10243 }, { "epoch": 0.15296515577986994, "grad_norm": 0.32421875, "grad_norm_var": 0.0011834303538004558, "learning_rate": 0.0001, "loss": 1.3883, "loss/crossentropy": 2.7381972074508667, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1929553747177124, "step": 10244 }, { "epoch": 0.15298008795048493, "grad_norm": 0.318359375, "grad_norm_var": 0.0007574558258056641, "learning_rate": 0.0001, "loss": 1.438, "loss/crossentropy": 2.631132125854492, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.22311003506183624, "step": 10245 }, { "epoch": 0.15299502012109992, "grad_norm": 0.306640625, "grad_norm_var": 0.00045928955078125, "learning_rate": 0.0001, "loss": 1.4436, "loss/crossentropy": 2.429311513900757, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20139717310667038, "step": 10246 }, { "epoch": 0.15300995229171488, "grad_norm": 0.330078125, "grad_norm_var": 0.0004728794097900391, "learning_rate": 0.0001, "loss": 1.4431, "loss/crossentropy": 2.633161187171936, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20871511101722717, "step": 10247 }, { "epoch": 0.15302488446232987, "grad_norm": 0.271484375, "grad_norm_var": 0.0005655765533447265, "learning_rate": 0.0001, "loss": 1.3946, "loss/crossentropy": 2.393094062805176, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19149921089410782, "step": 10248 }, { "epoch": 0.15303981663294486, "grad_norm": 0.31640625, "grad_norm_var": 0.0005681355794270833, "learning_rate": 0.0001, "loss": 1.4357, "loss/crossentropy": 2.574242353439331, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20137441158294678, "step": 10249 }, { "epoch": 0.15305474880355982, "grad_norm": 0.3125, "grad_norm_var": 0.000551287333170573, "learning_rate": 0.0001, "loss": 1.4036, "loss/crossentropy": 2.8170539140701294, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19658052176237106, "step": 10250 }, { "epoch": 0.1530696809741748, "grad_norm": 0.3125, "grad_norm_var": 0.000520769755045573, "learning_rate": 0.0001, "loss": 1.4889, "loss/crossentropy": 2.6754212379455566, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21549060940742493, "step": 10251 }, { "epoch": 0.1530846131447898, "grad_norm": 0.310546875, "grad_norm_var": 0.0004088719685872396, "learning_rate": 0.0001, "loss": 1.4101, "loss/crossentropy": 2.7018603086471558, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19522935152053833, "step": 10252 }, { "epoch": 0.15309954531540476, "grad_norm": 0.302734375, "grad_norm_var": 0.00037282307942708334, "learning_rate": 0.0001, "loss": 1.263, "loss/crossentropy": 2.729594349861145, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.16533683985471725, "step": 10253 }, { "epoch": 0.15311447748601975, "grad_norm": 0.30078125, "grad_norm_var": 0.0003787835439046224, "learning_rate": 0.0001, "loss": 1.4051, "loss/crossentropy": 2.452968955039978, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1941630244255066, "step": 10254 }, { "epoch": 0.15312940965663474, "grad_norm": 0.490234375, "grad_norm_var": 0.0022860050201416017, "learning_rate": 0.0001, "loss": 1.4295, "loss/crossentropy": 2.4824867248535156, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19514050334692, "step": 10255 }, { "epoch": 0.15314434182724973, "grad_norm": 0.361328125, "grad_norm_var": 0.0023660659790039062, "learning_rate": 0.0001, "loss": 1.5829, "loss/crossentropy": 2.4563395977020264, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.23519878834486008, "step": 10256 }, { "epoch": 0.1531592739978647, "grad_norm": 0.34765625, "grad_norm_var": 0.002332290013631185, "learning_rate": 0.0001, "loss": 1.4078, "loss/crossentropy": 2.6275439262390137, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.2046351358294487, "step": 10257 }, { "epoch": 0.15317420616847968, "grad_norm": 0.29296875, "grad_norm_var": 0.002347421646118164, "learning_rate": 0.0001, "loss": 1.4291, "loss/crossentropy": 2.688283920288086, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2103913202881813, "step": 10258 }, { "epoch": 0.15318913833909467, "grad_norm": 0.31640625, "grad_norm_var": 0.002344703674316406, "learning_rate": 0.0001, "loss": 1.301, "loss/crossentropy": 2.781883120536804, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.17212796211242676, "step": 10259 }, { "epoch": 0.15320407050970963, "grad_norm": 0.3125, "grad_norm_var": 0.00235595703125, "learning_rate": 0.0001, "loss": 1.5868, "loss/crossentropy": 2.371232032775879, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2743227481842041, "step": 10260 }, { "epoch": 0.15321900268032462, "grad_norm": 2.125, "grad_norm_var": 0.20470617612202963, "learning_rate": 0.0001, "loss": 1.9264, "loss/crossentropy": 2.8719159364700317, "loss/fcd": 1.52734375, "loss/idx": 11.0, "loss/logits": 0.39904214441776276, "step": 10261 }, { "epoch": 0.1532339348509396, "grad_norm": 0.4921875, "grad_norm_var": 0.2036053975423177, "learning_rate": 0.0001, "loss": 1.5751, "loss/crossentropy": 2.405090570449829, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.2040001079440117, "step": 10262 }, { "epoch": 0.15324886702155457, "grad_norm": 0.294921875, "grad_norm_var": 0.2042434056599935, "learning_rate": 0.0001, "loss": 1.4021, "loss/crossentropy": 2.6403167247772217, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19899305701255798, "step": 10263 }, { "epoch": 0.15326379919216956, "grad_norm": 0.34375, "grad_norm_var": 0.2028737227121989, "learning_rate": 0.0001, "loss": 1.5007, "loss/crossentropy": 2.6768418550491333, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.21949680149555206, "step": 10264 }, { "epoch": 0.15327873136278455, "grad_norm": 0.3671875, "grad_norm_var": 0.20211663246154785, "learning_rate": 0.0001, "loss": 1.5329, "loss/crossentropy": 2.528635025024414, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.22039416432380676, "step": 10265 }, { "epoch": 0.15329366353339954, "grad_norm": 0.294921875, "grad_norm_var": 0.20247039794921876, "learning_rate": 0.0001, "loss": 1.3537, "loss/crossentropy": 2.5138388872146606, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18963447958230972, "step": 10266 }, { "epoch": 0.1533085957040145, "grad_norm": 0.326171875, "grad_norm_var": 0.20222395261128742, "learning_rate": 0.0001, "loss": 1.4828, "loss/crossentropy": 2.7097058296203613, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20935095101594925, "step": 10267 }, { "epoch": 0.1533235278746295, "grad_norm": 0.296875, "grad_norm_var": 0.20249888102213542, "learning_rate": 0.0001, "loss": 1.3933, "loss/crossentropy": 2.6179425716400146, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19802707433700562, "step": 10268 }, { "epoch": 0.15333846004524448, "grad_norm": 0.33203125, "grad_norm_var": 0.2019612471262614, "learning_rate": 0.0001, "loss": 1.5976, "loss/crossentropy": 2.5766396522521973, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.2538573890924454, "step": 10269 }, { "epoch": 0.15335339221585945, "grad_norm": 0.337890625, "grad_norm_var": 0.2012796401977539, "learning_rate": 0.0001, "loss": 1.4215, "loss/crossentropy": 2.4701716899871826, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19888179004192352, "step": 10270 }, { "epoch": 0.15336832438647444, "grad_norm": 0.32421875, "grad_norm_var": 0.20229427019755045, "learning_rate": 0.0001, "loss": 1.43, "loss/crossentropy": 2.619894862174988, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2112211138010025, "step": 10271 }, { "epoch": 0.15338325655708943, "grad_norm": 0.27734375, "grad_norm_var": 0.20370426177978515, "learning_rate": 0.0001, "loss": 1.3711, "loss/crossentropy": 2.547343611717224, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19529064744710922, "step": 10272 }, { "epoch": 0.15339818872770442, "grad_norm": 0.302734375, "grad_norm_var": 0.20439922014872233, "learning_rate": 0.0001, "loss": 1.4031, "loss/crossentropy": 2.6136447191238403, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.1960354819893837, "step": 10273 }, { "epoch": 0.15341312089831938, "grad_norm": 0.318359375, "grad_norm_var": 0.20394236246744793, "learning_rate": 0.0001, "loss": 1.5084, "loss/crossentropy": 2.4873143434524536, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.21540068089962006, "step": 10274 }, { "epoch": 0.15342805306893437, "grad_norm": 0.5234375, "grad_norm_var": 0.20317071278889973, "learning_rate": 0.0001, "loss": 1.5681, "loss/crossentropy": 2.674493432044983, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.20483656227588654, "step": 10275 }, { "epoch": 0.15344298523954936, "grad_norm": 0.314453125, "grad_norm_var": 0.20313401222229005, "learning_rate": 0.0001, "loss": 1.3225, "loss/crossentropy": 2.7055071592330933, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1857726350426674, "step": 10276 }, { "epoch": 0.15345791741016432, "grad_norm": 0.287109375, "grad_norm_var": 0.004881731669108073, "learning_rate": 0.0001, "loss": 1.3248, "loss/crossentropy": 2.55391263961792, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17632263898849487, "step": 10277 }, { "epoch": 0.1534728495807793, "grad_norm": 0.298828125, "grad_norm_var": 0.003284565607706706, "learning_rate": 0.0001, "loss": 1.432, "loss/crossentropy": 2.4610174894332886, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20154869556427002, "step": 10278 }, { "epoch": 0.1534877817513943, "grad_norm": 0.29296875, "grad_norm_var": 0.003293291727701823, "learning_rate": 0.0001, "loss": 1.3399, "loss/crossentropy": 2.5978951454162598, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17977707833051682, "step": 10279 }, { "epoch": 0.15350271392200926, "grad_norm": 0.27734375, "grad_norm_var": 0.003424072265625, "learning_rate": 0.0001, "loss": 1.4279, "loss/crossentropy": 2.629034399986267, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20912303030490875, "step": 10280 }, { "epoch": 0.15351764609262425, "grad_norm": 0.392578125, "grad_norm_var": 0.003613138198852539, "learning_rate": 0.0001, "loss": 1.5126, "loss/crossentropy": 2.308270573616028, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.2117723673582077, "step": 10281 }, { "epoch": 0.15353257826323924, "grad_norm": 0.326171875, "grad_norm_var": 0.003549559911092122, "learning_rate": 0.0001, "loss": 1.592, "loss/crossentropy": 2.695164680480957, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.25220078229904175, "step": 10282 }, { "epoch": 0.15354751043385423, "grad_norm": 0.310546875, "grad_norm_var": 0.0035660902659098307, "learning_rate": 0.0001, "loss": 1.3852, "loss/crossentropy": 2.6808828115463257, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19772867858409882, "step": 10283 }, { "epoch": 0.1535624426044692, "grad_norm": 0.32421875, "grad_norm_var": 0.0035073439280192058, "learning_rate": 0.0001, "loss": 1.4704, "loss/crossentropy": 2.71970796585083, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21256455779075623, "step": 10284 }, { "epoch": 0.15357737477508418, "grad_norm": 0.3125, "grad_norm_var": 0.0035194238026936848, "learning_rate": 0.0001, "loss": 1.4076, "loss/crossentropy": 2.641803503036499, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19280324131250381, "step": 10285 }, { "epoch": 0.15359230694569917, "grad_norm": 0.28125, "grad_norm_var": 0.003632354736328125, "learning_rate": 0.0001, "loss": 1.3656, "loss/crossentropy": 2.3100011348724365, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1819794625043869, "step": 10286 }, { "epoch": 0.15360723911631413, "grad_norm": 0.35546875, "grad_norm_var": 0.003699493408203125, "learning_rate": 0.0001, "loss": 1.3716, "loss/crossentropy": 2.6826510429382324, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18409886956214905, "step": 10287 }, { "epoch": 0.15362217128692912, "grad_norm": 0.33984375, "grad_norm_var": 0.0035489400227864585, "learning_rate": 0.0001, "loss": 1.3785, "loss/crossentropy": 2.718430995941162, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.17144858837127686, "step": 10288 }, { "epoch": 0.1536371034575441, "grad_norm": 0.302734375, "grad_norm_var": 0.0035489400227864585, "learning_rate": 0.0001, "loss": 1.3969, "loss/crossentropy": 2.925034523010254, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.20161166787147522, "step": 10289 }, { "epoch": 0.1536520356281591, "grad_norm": 0.28515625, "grad_norm_var": 0.0036632378896077475, "learning_rate": 0.0001, "loss": 1.4249, "loss/crossentropy": 2.205015540122986, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.18663425743579865, "step": 10290 }, { "epoch": 0.15366696779877406, "grad_norm": 0.31640625, "grad_norm_var": 0.0009068648020426432, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.432114362716675, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.18284866213798523, "step": 10291 }, { "epoch": 0.15368189996938905, "grad_norm": 0.83203125, "grad_norm_var": 0.017708778381347656, "learning_rate": 0.0001, "loss": 1.486, "loss/crossentropy": 2.4122507572174072, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.1930481344461441, "step": 10292 }, { "epoch": 0.15369683214000404, "grad_norm": 0.396484375, "grad_norm_var": 0.017598406473795573, "learning_rate": 0.0001, "loss": 1.6181, "loss/crossentropy": 2.69111430644989, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.25483036041259766, "step": 10293 }, { "epoch": 0.153711764310619, "grad_norm": 0.3203125, "grad_norm_var": 0.01747269630432129, "learning_rate": 0.0001, "loss": 1.445, "loss/crossentropy": 2.6978421211242676, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.19894419610500336, "step": 10294 }, { "epoch": 0.153726696481234, "grad_norm": 0.291015625, "grad_norm_var": 0.017488861083984376, "learning_rate": 0.0001, "loss": 1.4274, "loss/crossentropy": 2.6454278230667114, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.1929902657866478, "step": 10295 }, { "epoch": 0.15374162865184898, "grad_norm": 0.287109375, "grad_norm_var": 0.01739500363667806, "learning_rate": 0.0001, "loss": 1.3163, "loss/crossentropy": 2.6563780307769775, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.17954544723033905, "step": 10296 }, { "epoch": 0.15375656082246394, "grad_norm": 0.3828125, "grad_norm_var": 0.017351531982421876, "learning_rate": 0.0001, "loss": 1.532, "loss/crossentropy": 2.849358916282654, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.23512587696313858, "step": 10297 }, { "epoch": 0.15377149299307893, "grad_norm": 0.466796875, "grad_norm_var": 0.018065643310546876, "learning_rate": 0.0001, "loss": 1.4628, "loss/crossentropy": 2.6370153427124023, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2167094200849533, "step": 10298 }, { "epoch": 0.15378642516369392, "grad_norm": 0.318359375, "grad_norm_var": 0.018015034993489585, "learning_rate": 0.0001, "loss": 1.356, "loss/crossentropy": 2.650213837623596, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18803274631500244, "step": 10299 }, { "epoch": 0.1538013573343089, "grad_norm": 0.306640625, "grad_norm_var": 0.01812589963277181, "learning_rate": 0.0001, "loss": 1.3178, "loss/crossentropy": 2.7345287799835205, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17329742014408112, "step": 10300 }, { "epoch": 0.15381628950492388, "grad_norm": 0.337890625, "grad_norm_var": 0.01799799601236979, "learning_rate": 0.0001, "loss": 1.4382, "loss/crossentropy": 2.575053334236145, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20380962640047073, "step": 10301 }, { "epoch": 0.15383122167553887, "grad_norm": 0.421875, "grad_norm_var": 0.01768671671549479, "learning_rate": 0.0001, "loss": 1.701, "loss/crossentropy": 2.4008620977401733, "loss/fcd": 1.4453125, "loss/idx": 11.0, "loss/logits": 0.25569725781679153, "step": 10302 }, { "epoch": 0.15384615384615385, "grad_norm": 0.3515625, "grad_norm_var": 0.017696571350097657, "learning_rate": 0.0001, "loss": 1.4419, "loss/crossentropy": 2.691684126853943, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20361968129873276, "step": 10303 }, { "epoch": 0.15386108601676882, "grad_norm": 0.302734375, "grad_norm_var": 0.017943302790323894, "learning_rate": 0.0001, "loss": 1.2575, "loss/crossentropy": 2.580960750579834, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.15987759083509445, "step": 10304 }, { "epoch": 0.1538760181873838, "grad_norm": 0.298828125, "grad_norm_var": 0.01797928810119629, "learning_rate": 0.0001, "loss": 1.4351, "loss/crossentropy": 2.3060734272003174, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.208571158349514, "step": 10305 }, { "epoch": 0.1538909503579988, "grad_norm": 0.3515625, "grad_norm_var": 0.017505884170532227, "learning_rate": 0.0001, "loss": 1.6384, "loss/crossentropy": 2.4560354948043823, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.24390608817338943, "step": 10306 }, { "epoch": 0.15390588252861379, "grad_norm": 0.27734375, "grad_norm_var": 0.017900705337524414, "learning_rate": 0.0001, "loss": 1.2472, "loss/crossentropy": 2.6356608867645264, "loss/fcd": 1.0859375, "loss/idx": 11.0, "loss/logits": 0.16126340627670288, "step": 10307 }, { "epoch": 0.15392081469922875, "grad_norm": 0.36328125, "grad_norm_var": 0.002847909927368164, "learning_rate": 0.0001, "loss": 1.3408, "loss/crossentropy": 2.7714954614639282, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18063294142484665, "step": 10308 }, { "epoch": 0.15393574686984374, "grad_norm": 0.34765625, "grad_norm_var": 0.0026432673136393228, "learning_rate": 0.0001, "loss": 1.4589, "loss/crossentropy": 2.9945160150527954, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21667400002479553, "step": 10309 }, { "epoch": 0.15395067904045873, "grad_norm": 0.32421875, "grad_norm_var": 0.002634429931640625, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 2.1718602180480957, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21925388276576996, "step": 10310 }, { "epoch": 0.1539656112110737, "grad_norm": 0.37890625, "grad_norm_var": 0.0025507450103759766, "learning_rate": 0.0001, "loss": 1.608, "loss/crossentropy": 2.7714418172836304, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.26038359105587006, "step": 10311 }, { "epoch": 0.15398054338168868, "grad_norm": 0.31640625, "grad_norm_var": 0.00237884521484375, "learning_rate": 0.0001, "loss": 1.3985, "loss/crossentropy": 2.602027416229248, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.179709330201149, "step": 10312 }, { "epoch": 0.15399547555230367, "grad_norm": 0.3046875, "grad_norm_var": 0.002383931477864583, "learning_rate": 0.0001, "loss": 1.3156, "loss/crossentropy": 2.6579867601394653, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.19059041142463684, "step": 10313 }, { "epoch": 0.15401040772291863, "grad_norm": 0.294921875, "grad_norm_var": 0.00136566162109375, "learning_rate": 0.0001, "loss": 1.4308, "loss/crossentropy": 2.5903559923171997, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20428360253572464, "step": 10314 }, { "epoch": 0.15402533989353362, "grad_norm": 0.353515625, "grad_norm_var": 0.0013833999633789062, "learning_rate": 0.0001, "loss": 1.548, "loss/crossentropy": 2.867442011833191, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.23158246278762817, "step": 10315 }, { "epoch": 0.1540402720641486, "grad_norm": 0.296875, "grad_norm_var": 0.0014240105946858725, "learning_rate": 0.0001, "loss": 1.47, "loss/crossentropy": 2.4468144178390503, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.22387761622667313, "step": 10316 }, { "epoch": 0.1540552042347636, "grad_norm": 0.328125, "grad_norm_var": 0.0014231363932291667, "learning_rate": 0.0001, "loss": 1.4845, "loss/crossentropy": 2.6998482942581177, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21892239153385162, "step": 10317 }, { "epoch": 0.15407013640537856, "grad_norm": 0.33203125, "grad_norm_var": 0.0008513768513997396, "learning_rate": 0.0001, "loss": 1.3897, "loss/crossentropy": 2.5448538064956665, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.17878153175115585, "step": 10318 }, { "epoch": 0.15408506857599355, "grad_norm": 0.27734375, "grad_norm_var": 0.000946807861328125, "learning_rate": 0.0001, "loss": 1.35, "loss/crossentropy": 2.5145944356918335, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18981705605983734, "step": 10319 }, { "epoch": 0.15410000074660854, "grad_norm": 0.337890625, "grad_norm_var": 0.0009347915649414063, "learning_rate": 0.0001, "loss": 1.3458, "loss/crossentropy": 2.7018414735794067, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1700320839881897, "step": 10320 }, { "epoch": 0.1541149329172235, "grad_norm": 0.267578125, "grad_norm_var": 0.0011006037394205729, "learning_rate": 0.0001, "loss": 1.2971, "loss/crossentropy": 2.5878080129623413, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.17207636684179306, "step": 10321 }, { "epoch": 0.1541298650878385, "grad_norm": 0.294921875, "grad_norm_var": 0.001078017552693685, "learning_rate": 0.0001, "loss": 1.3902, "loss/crossentropy": 2.707989454269409, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19099503010511398, "step": 10322 }, { "epoch": 0.15414479725845348, "grad_norm": 0.283203125, "grad_norm_var": 0.0010480244954427084, "learning_rate": 0.0001, "loss": 1.3579, "loss/crossentropy": 2.787068724632263, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1938195377588272, "step": 10323 }, { "epoch": 0.15415972942906844, "grad_norm": 0.263671875, "grad_norm_var": 0.001078017552693685, "learning_rate": 0.0001, "loss": 1.3427, "loss/crossentropy": 2.616403818130493, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.19425395131111145, "step": 10324 }, { "epoch": 0.15417466159968343, "grad_norm": 0.365234375, "grad_norm_var": 0.0011794408162434896, "learning_rate": 0.0001, "loss": 1.5532, "loss/crossentropy": 2.5407822132110596, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.22897964715957642, "step": 10325 }, { "epoch": 0.15418959377029842, "grad_norm": 0.3046875, "grad_norm_var": 0.0011759440104166666, "learning_rate": 0.0001, "loss": 1.3418, "loss/crossentropy": 2.5522466897964478, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1816677376627922, "step": 10326 }, { "epoch": 0.1542045259409134, "grad_norm": 0.333984375, "grad_norm_var": 0.0009043216705322266, "learning_rate": 0.0001, "loss": 1.6289, "loss/crossentropy": 2.24028217792511, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.25776753574609756, "step": 10327 }, { "epoch": 0.15421945811152837, "grad_norm": 0.326171875, "grad_norm_var": 0.0009190241495768229, "learning_rate": 0.0001, "loss": 1.4372, "loss/crossentropy": 2.633369565010071, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19889649003744125, "step": 10328 }, { "epoch": 0.15423439028214336, "grad_norm": 0.322265625, "grad_norm_var": 0.0009251753489176432, "learning_rate": 0.0001, "loss": 1.3428, "loss/crossentropy": 2.70590877532959, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18265659362077713, "step": 10329 }, { "epoch": 0.15424932245275835, "grad_norm": 0.345703125, "grad_norm_var": 0.0009747664133707682, "learning_rate": 0.0001, "loss": 1.4409, "loss/crossentropy": 2.5667755603790283, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.19480863958597183, "step": 10330 }, { "epoch": 0.15426425462337331, "grad_norm": 0.302734375, "grad_norm_var": 0.0008722782135009765, "learning_rate": 0.0001, "loss": 1.3719, "loss/crossentropy": 2.547060251235962, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1883322224020958, "step": 10331 }, { "epoch": 0.1542791867939883, "grad_norm": 0.30859375, "grad_norm_var": 0.0008581638336181641, "learning_rate": 0.0001, "loss": 1.3421, "loss/crossentropy": 2.6341291666030884, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1741778552532196, "step": 10332 }, { "epoch": 0.1542941189646033, "grad_norm": 0.32421875, "grad_norm_var": 0.0008507887522379557, "learning_rate": 0.0001, "loss": 1.533, "loss/crossentropy": 2.3425278663635254, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.22441940009593964, "step": 10333 }, { "epoch": 0.15430905113521828, "grad_norm": 0.302734375, "grad_norm_var": 0.0008257548014322917, "learning_rate": 0.0001, "loss": 1.3583, "loss/crossentropy": 2.514080762863159, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1786341592669487, "step": 10334 }, { "epoch": 0.15432398330583325, "grad_norm": 0.384765625, "grad_norm_var": 0.0010783990224202474, "learning_rate": 0.0001, "loss": 1.5729, "loss/crossentropy": 2.2799055576324463, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.24087198078632355, "step": 10335 }, { "epoch": 0.15433891547644824, "grad_norm": 0.359375, "grad_norm_var": 0.001167742411295573, "learning_rate": 0.0001, "loss": 1.4088, "loss/crossentropy": 2.663171887397766, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19391708076000214, "step": 10336 }, { "epoch": 0.15435384764706322, "grad_norm": 0.38671875, "grad_norm_var": 0.0012520949045817058, "learning_rate": 0.0001, "loss": 1.4679, "loss/crossentropy": 2.3941075801849365, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20622903108596802, "step": 10337 }, { "epoch": 0.1543687798176782, "grad_norm": 0.3046875, "grad_norm_var": 0.0012181599934895833, "learning_rate": 0.0001, "loss": 1.3419, "loss/crossentropy": 2.652997851371765, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18169458210468292, "step": 10338 }, { "epoch": 0.15438371198829318, "grad_norm": 0.3515625, "grad_norm_var": 0.0011185805002848308, "learning_rate": 0.0001, "loss": 1.4212, "loss/crossentropy": 2.5705251693725586, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19459453225135803, "step": 10339 }, { "epoch": 0.15439864415890817, "grad_norm": 0.333984375, "grad_norm_var": 0.0008015791575113932, "learning_rate": 0.0001, "loss": 1.4494, "loss/crossentropy": 2.721874952316284, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.23060675710439682, "step": 10340 }, { "epoch": 0.15441357632952313, "grad_norm": 0.3125, "grad_norm_var": 0.0007616678873697916, "learning_rate": 0.0001, "loss": 1.3049, "loss/crossentropy": 2.642345905303955, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17209315299987793, "step": 10341 }, { "epoch": 0.15442850850013812, "grad_norm": 0.31640625, "grad_norm_var": 0.0007282892862955729, "learning_rate": 0.0001, "loss": 1.5231, "loss/crossentropy": 2.669625163078308, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.22236517816781998, "step": 10342 }, { "epoch": 0.1544434406707531, "grad_norm": 0.314453125, "grad_norm_var": 0.0007476806640625, "learning_rate": 0.0001, "loss": 1.4259, "loss/crossentropy": 2.612518310546875, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20329228788614273, "step": 10343 }, { "epoch": 0.1544583728413681, "grad_norm": 0.296875, "grad_norm_var": 0.0008203983306884766, "learning_rate": 0.0001, "loss": 1.3634, "loss/crossentropy": 2.611993432044983, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1836644485592842, "step": 10344 }, { "epoch": 0.15447330501198306, "grad_norm": 0.357421875, "grad_norm_var": 0.000865030288696289, "learning_rate": 0.0001, "loss": 1.4882, "loss/crossentropy": 2.5397112369537354, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.20301465690135956, "step": 10345 }, { "epoch": 0.15448823718259805, "grad_norm": 0.310546875, "grad_norm_var": 0.0008753299713134766, "learning_rate": 0.0001, "loss": 1.4294, "loss/crossentropy": 2.7175872325897217, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20677922666072845, "step": 10346 }, { "epoch": 0.15450316935321304, "grad_norm": 0.283203125, "grad_norm_var": 0.0009681542714436848, "learning_rate": 0.0001, "loss": 1.2913, "loss/crossentropy": 2.4759888648986816, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.16626334935426712, "step": 10347 }, { "epoch": 0.154518101523828, "grad_norm": 0.296875, "grad_norm_var": 0.00100706418355306, "learning_rate": 0.0001, "loss": 1.4595, "loss/crossentropy": 2.843685746192932, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21727902442216873, "step": 10348 }, { "epoch": 0.154533033694443, "grad_norm": 0.32421875, "grad_norm_var": 0.00100706418355306, "learning_rate": 0.0001, "loss": 1.358, "loss/crossentropy": 2.4166722297668457, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1744464561343193, "step": 10349 }, { "epoch": 0.15454796586505798, "grad_norm": 0.318359375, "grad_norm_var": 0.0009712060292561849, "learning_rate": 0.0001, "loss": 1.417, "loss/crossentropy": 2.5016250610351562, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1904124915599823, "step": 10350 }, { "epoch": 0.15456289803567297, "grad_norm": 0.328125, "grad_norm_var": 0.0007448832194010417, "learning_rate": 0.0001, "loss": 1.443, "loss/crossentropy": 2.7219637632369995, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21257665753364563, "step": 10351 }, { "epoch": 0.15457783020628793, "grad_norm": 0.294921875, "grad_norm_var": 0.0007065931955973307, "learning_rate": 0.0001, "loss": 1.3237, "loss/crossentropy": 2.702906847000122, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.18700268864631653, "step": 10352 }, { "epoch": 0.15459276237690292, "grad_norm": 0.302734375, "grad_norm_var": 0.00040791829427083335, "learning_rate": 0.0001, "loss": 1.4011, "loss/crossentropy": 2.6062432527542114, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.18234646320343018, "step": 10353 }, { "epoch": 0.1546076945475179, "grad_norm": 0.3125, "grad_norm_var": 0.000400543212890625, "learning_rate": 0.0001, "loss": 1.3866, "loss/crossentropy": 2.5686439275741577, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.2068653106689453, "step": 10354 }, { "epoch": 0.15462262671813287, "grad_norm": 0.296875, "grad_norm_var": 0.00032755533854166664, "learning_rate": 0.0001, "loss": 1.3617, "loss/crossentropy": 2.566774845123291, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18980246782302856, "step": 10355 }, { "epoch": 0.15463755888874786, "grad_norm": 0.318359375, "grad_norm_var": 0.00029805501302083336, "learning_rate": 0.0001, "loss": 1.3995, "loss/crossentropy": 2.4307941198349, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.18465910851955414, "step": 10356 }, { "epoch": 0.15465249105936285, "grad_norm": 0.33984375, "grad_norm_var": 0.0003483454386393229, "learning_rate": 0.0001, "loss": 1.4168, "loss/crossentropy": 2.454893112182617, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1902001053094864, "step": 10357 }, { "epoch": 0.1546674232299778, "grad_norm": 0.3046875, "grad_norm_var": 0.00035196940104166666, "learning_rate": 0.0001, "loss": 1.4299, "loss/crossentropy": 2.53447163105011, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19548363238573074, "step": 10358 }, { "epoch": 0.1546823554005928, "grad_norm": 0.3046875, "grad_norm_var": 0.00035538673400878904, "learning_rate": 0.0001, "loss": 1.3779, "loss/crossentropy": 2.5289303064346313, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18262497335672379, "step": 10359 }, { "epoch": 0.1546972875712078, "grad_norm": 0.28515625, "grad_norm_var": 0.00038743019104003906, "learning_rate": 0.0001, "loss": 1.3429, "loss/crossentropy": 2.785730242729187, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18276400864124298, "step": 10360 }, { "epoch": 0.15471221974182278, "grad_norm": 0.306640625, "grad_norm_var": 0.0002353509267171224, "learning_rate": 0.0001, "loss": 1.4166, "loss/crossentropy": 2.7090240716934204, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20171860605478287, "step": 10361 }, { "epoch": 0.15472715191243774, "grad_norm": 0.294921875, "grad_norm_var": 0.0002452691396077474, "learning_rate": 0.0001, "loss": 1.4223, "loss/crossentropy": 2.733282208442688, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.21132808178663254, "step": 10362 }, { "epoch": 0.15474208408305273, "grad_norm": 0.365234375, "grad_norm_var": 0.0004054864247639974, "learning_rate": 0.0001, "loss": 1.6974, "loss/crossentropy": 2.570768117904663, "loss/fcd": 1.453125, "loss/idx": 11.0, "loss/logits": 0.2443201169371605, "step": 10363 }, { "epoch": 0.15475701625366772, "grad_norm": 0.330078125, "grad_norm_var": 0.00040683746337890627, "learning_rate": 0.0001, "loss": 1.4045, "loss/crossentropy": 2.4303618669509888, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18183254450559616, "step": 10364 }, { "epoch": 0.15477194842428268, "grad_norm": 0.359375, "grad_norm_var": 0.000531005859375, "learning_rate": 0.0001, "loss": 1.7748, "loss/crossentropy": 2.734029769897461, "loss/fcd": 1.484375, "loss/idx": 11.0, "loss/logits": 0.29043249040842056, "step": 10365 }, { "epoch": 0.15478688059489767, "grad_norm": 0.3984375, "grad_norm_var": 0.0009526411692301432, "learning_rate": 0.0001, "loss": 1.4629, "loss/crossentropy": 2.728504776954651, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.19332492351531982, "step": 10366 }, { "epoch": 0.15480181276551266, "grad_norm": 0.384765625, "grad_norm_var": 0.0012038548787434895, "learning_rate": 0.0001, "loss": 1.6492, "loss/crossentropy": 2.445830821990967, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.254707008600235, "step": 10367 }, { "epoch": 0.15481674493612765, "grad_norm": 0.30078125, "grad_norm_var": 0.00118254025777181, "learning_rate": 0.0001, "loss": 1.5096, "loss/crossentropy": 2.5371644496917725, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.22056324779987335, "step": 10368 }, { "epoch": 0.15483167710674262, "grad_norm": 0.3046875, "grad_norm_var": 0.0011768976847330729, "learning_rate": 0.0001, "loss": 1.4496, "loss/crossentropy": 2.568998336791992, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19964108616113663, "step": 10369 }, { "epoch": 0.1548466092773576, "grad_norm": 0.3828125, "grad_norm_var": 0.0013645807902018228, "learning_rate": 0.0001, "loss": 1.5247, "loss/crossentropy": 2.3024171590805054, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.2082575559616089, "step": 10370 }, { "epoch": 0.1548615414479726, "grad_norm": 0.298828125, "grad_norm_var": 0.0013562361399332681, "learning_rate": 0.0001, "loss": 1.4917, "loss/crossentropy": 2.532908320426941, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2026112675666809, "step": 10371 }, { "epoch": 0.15487647361858756, "grad_norm": 0.486328125, "grad_norm_var": 0.0028598626454671224, "learning_rate": 0.0001, "loss": 1.5527, "loss/crossentropy": 2.6364376544952393, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.2167535424232483, "step": 10372 }, { "epoch": 0.15489140578920255, "grad_norm": 0.28125, "grad_norm_var": 0.00307920773824056, "learning_rate": 0.0001, "loss": 1.3368, "loss/crossentropy": 2.5599303245544434, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17663279920816422, "step": 10373 }, { "epoch": 0.15490633795981754, "grad_norm": 0.30859375, "grad_norm_var": 0.0030634403228759766, "learning_rate": 0.0001, "loss": 1.3673, "loss/crossentropy": 2.3993648290634155, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1719723492860794, "step": 10374 }, { "epoch": 0.1549212701304325, "grad_norm": 0.376953125, "grad_norm_var": 0.0030781428019205728, "learning_rate": 0.0001, "loss": 1.5851, "loss/crossentropy": 2.8264453411102295, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.23348908126354218, "step": 10375 }, { "epoch": 0.1549362023010475, "grad_norm": 0.3125, "grad_norm_var": 0.0029192606608072916, "learning_rate": 0.0001, "loss": 1.4505, "loss/crossentropy": 2.677731156349182, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2083277925848961, "step": 10376 }, { "epoch": 0.15495113447166248, "grad_norm": 0.357421875, "grad_norm_var": 0.002832476298014323, "learning_rate": 0.0001, "loss": 1.5571, "loss/crossentropy": 2.4921308755874634, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.22900207340717316, "step": 10377 }, { "epoch": 0.15496606664227747, "grad_norm": 0.314453125, "grad_norm_var": 0.00272216796875, "learning_rate": 0.0001, "loss": 1.5107, "loss/crossentropy": 2.485390067100525, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22943679243326187, "step": 10378 }, { "epoch": 0.15498099881289243, "grad_norm": 0.3203125, "grad_norm_var": 0.0027430057525634766, "learning_rate": 0.0001, "loss": 1.4654, "loss/crossentropy": 2.542944312095642, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.19191692769527435, "step": 10379 }, { "epoch": 0.15499593098350742, "grad_norm": 0.353515625, "grad_norm_var": 0.002731180191040039, "learning_rate": 0.0001, "loss": 1.5003, "loss/crossentropy": 2.7437559366226196, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.219091959297657, "step": 10380 }, { "epoch": 0.1550108631541224, "grad_norm": 0.271484375, "grad_norm_var": 0.0030609130859375, "learning_rate": 0.0001, "loss": 1.271, "loss/crossentropy": 2.5780614614486694, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.1655750647187233, "step": 10381 }, { "epoch": 0.15502579532473737, "grad_norm": 0.390625, "grad_norm_var": 0.0030047098795572915, "learning_rate": 0.0001, "loss": 1.8163, "loss/crossentropy": 2.347400188446045, "loss/fcd": 1.53125, "loss/idx": 11.0, "loss/logits": 0.28504670411348343, "step": 10382 }, { "epoch": 0.15504072749535236, "grad_norm": 0.3125, "grad_norm_var": 0.0029029687245686847, "learning_rate": 0.0001, "loss": 1.4308, "loss/crossentropy": 2.4836524724960327, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.18471767008304596, "step": 10383 }, { "epoch": 0.15505565966596735, "grad_norm": 0.296875, "grad_norm_var": 0.002922169367472331, "learning_rate": 0.0001, "loss": 1.2902, "loss/crossentropy": 2.575258731842041, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.17691632360219955, "step": 10384 }, { "epoch": 0.1550705918365823, "grad_norm": 0.318359375, "grad_norm_var": 0.0028775533040364585, "learning_rate": 0.0001, "loss": 1.5552, "loss/crossentropy": 2.7295658588409424, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.2231624722480774, "step": 10385 }, { "epoch": 0.1550855240071973, "grad_norm": 0.322265625, "grad_norm_var": 0.0027321974436442056, "learning_rate": 0.0001, "loss": 1.4833, "loss/crossentropy": 2.572913885116577, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20985451340675354, "step": 10386 }, { "epoch": 0.1551004561778123, "grad_norm": 0.2734375, "grad_norm_var": 0.002886962890625, "learning_rate": 0.0001, "loss": 1.4374, "loss/crossentropy": 2.6489099264144897, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2108648642897606, "step": 10387 }, { "epoch": 0.15511538834842728, "grad_norm": 0.3203125, "grad_norm_var": 0.001172494888305664, "learning_rate": 0.0001, "loss": 1.5521, "loss/crossentropy": 2.3489744663238525, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.21221821010112762, "step": 10388 }, { "epoch": 0.15513032051904224, "grad_norm": 0.29296875, "grad_norm_var": 0.0011194705963134765, "learning_rate": 0.0001, "loss": 1.2891, "loss/crossentropy": 2.572102904319763, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.16800733655691147, "step": 10389 }, { "epoch": 0.15514525268965723, "grad_norm": 0.3046875, "grad_norm_var": 0.0011270999908447265, "learning_rate": 0.0001, "loss": 1.4373, "loss/crossentropy": 2.387369155883789, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.19513728469610214, "step": 10390 }, { "epoch": 0.15516018486027222, "grad_norm": 0.337890625, "grad_norm_var": 0.0009319146474202473, "learning_rate": 0.0001, "loss": 1.4232, "loss/crossentropy": 2.3539541959762573, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.18493019044399261, "step": 10391 }, { "epoch": 0.15517511703088718, "grad_norm": 0.294921875, "grad_norm_var": 0.0009658177693684896, "learning_rate": 0.0001, "loss": 1.2847, "loss/crossentropy": 2.6549450159072876, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.17136912792921066, "step": 10392 }, { "epoch": 0.15519004920150217, "grad_norm": 0.3359375, "grad_norm_var": 0.0008806705474853516, "learning_rate": 0.0001, "loss": 1.4371, "loss/crossentropy": 2.561074376106262, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2026943936944008, "step": 10393 }, { "epoch": 0.15520498137211716, "grad_norm": 0.31640625, "grad_norm_var": 0.00088043212890625, "learning_rate": 0.0001, "loss": 1.4137, "loss/crossentropy": 2.481162667274475, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19102872908115387, "step": 10394 }, { "epoch": 0.15521991354273215, "grad_norm": 0.279296875, "grad_norm_var": 0.000964212417602539, "learning_rate": 0.0001, "loss": 1.3486, "loss/crossentropy": 2.6158618927001953, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18847212195396423, "step": 10395 }, { "epoch": 0.1552348457133471, "grad_norm": 0.30078125, "grad_norm_var": 0.00085906982421875, "learning_rate": 0.0001, "loss": 1.3889, "loss/crossentropy": 2.630620241165161, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19355785101652145, "step": 10396 }, { "epoch": 0.1552497778839621, "grad_norm": 0.72265625, "grad_norm_var": 0.011231470108032226, "learning_rate": 0.0001, "loss": 2.2166, "loss/crossentropy": 2.353147268295288, "loss/fcd": 1.89453125, "loss/idx": 11.0, "loss/logits": 0.3220770061016083, "step": 10397 }, { "epoch": 0.1552647100545771, "grad_norm": 0.330078125, "grad_norm_var": 0.011041768391927083, "learning_rate": 0.0001, "loss": 1.4921, "loss/crossentropy": 2.533868908882141, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2069055289030075, "step": 10398 }, { "epoch": 0.15527964222519205, "grad_norm": 0.30859375, "grad_norm_var": 0.011054420471191406, "learning_rate": 0.0001, "loss": 1.3723, "loss/crossentropy": 2.5873624086380005, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19258543848991394, "step": 10399 }, { "epoch": 0.15529457439580704, "grad_norm": 0.291015625, "grad_norm_var": 0.011086130142211914, "learning_rate": 0.0001, "loss": 1.3594, "loss/crossentropy": 2.3669735193252563, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1875211000442505, "step": 10400 }, { "epoch": 0.15530950656642203, "grad_norm": 0.283203125, "grad_norm_var": 0.011238336563110352, "learning_rate": 0.0001, "loss": 1.4484, "loss/crossentropy": 2.694767475128174, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2140190377831459, "step": 10401 }, { "epoch": 0.155324438737037, "grad_norm": 0.318359375, "grad_norm_var": 0.011244440078735351, "learning_rate": 0.0001, "loss": 1.4922, "loss/crossentropy": 2.5621650218963623, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21879685670137405, "step": 10402 }, { "epoch": 0.15533937090765199, "grad_norm": 0.29296875, "grad_norm_var": 0.01111601193745931, "learning_rate": 0.0001, "loss": 1.3878, "loss/crossentropy": 2.683125615119934, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.20417076349258423, "step": 10403 }, { "epoch": 0.15535430307826698, "grad_norm": 0.34765625, "grad_norm_var": 0.01111601193745931, "learning_rate": 0.0001, "loss": 1.4024, "loss/crossentropy": 2.712531805038452, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19923298805952072, "step": 10404 }, { "epoch": 0.15536923524888197, "grad_norm": 0.302734375, "grad_norm_var": 0.011067454020182292, "learning_rate": 0.0001, "loss": 1.3955, "loss/crossentropy": 2.5143038034439087, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18457522988319397, "step": 10405 }, { "epoch": 0.15538416741949693, "grad_norm": 0.279296875, "grad_norm_var": 0.011211887995402018, "learning_rate": 0.0001, "loss": 1.3192, "loss/crossentropy": 2.5755081176757812, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17471083253622055, "step": 10406 }, { "epoch": 0.15539909959011192, "grad_norm": 0.3515625, "grad_norm_var": 0.011230913798014323, "learning_rate": 0.0001, "loss": 1.5216, "loss/crossentropy": 2.594753623008728, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.22079790383577347, "step": 10407 }, { "epoch": 0.1554140317607269, "grad_norm": 0.328125, "grad_norm_var": 0.01112364133199056, "learning_rate": 0.0001, "loss": 1.5212, "loss/crossentropy": 2.570824146270752, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.24779749661684036, "step": 10408 }, { "epoch": 0.15542896393134187, "grad_norm": 0.2890625, "grad_norm_var": 0.011266311009724935, "learning_rate": 0.0001, "loss": 1.3679, "loss/crossentropy": 2.741608738899231, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18821732699871063, "step": 10409 }, { "epoch": 0.15544389610195686, "grad_norm": 0.28125, "grad_norm_var": 0.011425383885701497, "learning_rate": 0.0001, "loss": 1.4727, "loss/crossentropy": 2.480408787727356, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.22265025973320007, "step": 10410 }, { "epoch": 0.15545882827257185, "grad_norm": 0.32421875, "grad_norm_var": 0.011237843831380209, "learning_rate": 0.0001, "loss": 1.4495, "loss/crossentropy": 2.443572163581848, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21514973044395447, "step": 10411 }, { "epoch": 0.15547376044318684, "grad_norm": 0.3515625, "grad_norm_var": 0.011170895894368489, "learning_rate": 0.0001, "loss": 1.4519, "loss/crossentropy": 2.471922755241394, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.18626605719327927, "step": 10412 }, { "epoch": 0.1554886926138018, "grad_norm": 0.298828125, "grad_norm_var": 0.0006407260894775391, "learning_rate": 0.0001, "loss": 1.3964, "loss/crossentropy": 2.68536114692688, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.20502112805843353, "step": 10413 }, { "epoch": 0.1555036247844168, "grad_norm": 0.357421875, "grad_norm_var": 0.0007564385732014973, "learning_rate": 0.0001, "loss": 1.3615, "loss/crossentropy": 2.525555729866028, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1778968870639801, "step": 10414 }, { "epoch": 0.15551855695503178, "grad_norm": 0.302734375, "grad_norm_var": 0.0007619222005208333, "learning_rate": 0.0001, "loss": 1.4268, "loss/crossentropy": 2.478036880493164, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.1923869103193283, "step": 10415 }, { "epoch": 0.15553348912564674, "grad_norm": 0.287109375, "grad_norm_var": 0.0007740656534830729, "learning_rate": 0.0001, "loss": 1.2785, "loss/crossentropy": 2.5520130395889282, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.16131959110498428, "step": 10416 }, { "epoch": 0.15554842129626173, "grad_norm": 0.294921875, "grad_norm_var": 0.0007372538248697917, "learning_rate": 0.0001, "loss": 1.4161, "loss/crossentropy": 2.6943694353103638, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20904292166233063, "step": 10417 }, { "epoch": 0.15556335346687672, "grad_norm": 0.310546875, "grad_norm_var": 0.0007354736328125, "learning_rate": 0.0001, "loss": 1.4318, "loss/crossentropy": 2.567999839782715, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2052145153284073, "step": 10418 }, { "epoch": 0.15557828563749168, "grad_norm": 0.296875, "grad_norm_var": 0.0007262547810872396, "learning_rate": 0.0001, "loss": 1.4137, "loss/crossentropy": 2.6762633323669434, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19889705628156662, "step": 10419 }, { "epoch": 0.15559321780810667, "grad_norm": 0.3203125, "grad_norm_var": 0.0006457010904947917, "learning_rate": 0.0001, "loss": 1.3964, "loss/crossentropy": 2.4446141719818115, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18935374170541763, "step": 10420 }, { "epoch": 0.15560814997872166, "grad_norm": 0.310546875, "grad_norm_var": 0.000640869140625, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.7788615226745605, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18916506320238113, "step": 10421 }, { "epoch": 0.15562308214933665, "grad_norm": 0.3046875, "grad_norm_var": 0.0005720615386962891, "learning_rate": 0.0001, "loss": 1.3862, "loss/crossentropy": 2.7517929077148438, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1947510540485382, "step": 10422 }, { "epoch": 0.1556380143199516, "grad_norm": 0.333984375, "grad_norm_var": 0.000501251220703125, "learning_rate": 0.0001, "loss": 1.4356, "loss/crossentropy": 2.8001002073287964, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.19341668486595154, "step": 10423 }, { "epoch": 0.1556529464905666, "grad_norm": 0.306640625, "grad_norm_var": 0.00048394203186035155, "learning_rate": 0.0001, "loss": 1.3648, "loss/crossentropy": 2.831053137779236, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1811753362417221, "step": 10424 }, { "epoch": 0.1556678786611816, "grad_norm": 0.291015625, "grad_norm_var": 0.00047855377197265626, "learning_rate": 0.0001, "loss": 1.3936, "loss/crossentropy": 2.6560239791870117, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19824127852916718, "step": 10425 }, { "epoch": 0.15568281083179655, "grad_norm": 0.298828125, "grad_norm_var": 0.00042862892150878905, "learning_rate": 0.0001, "loss": 1.3579, "loss/crossentropy": 2.6245471239089966, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1821025088429451, "step": 10426 }, { "epoch": 0.15569774300241154, "grad_norm": 0.287109375, "grad_norm_var": 0.00045369466145833335, "learning_rate": 0.0001, "loss": 1.3412, "loss/crossentropy": 2.473367691040039, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1849280297756195, "step": 10427 }, { "epoch": 0.15571267517302653, "grad_norm": 0.263671875, "grad_norm_var": 0.0004443963368733724, "learning_rate": 0.0001, "loss": 1.2847, "loss/crossentropy": 2.291164755821228, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1558341085910797, "step": 10428 }, { "epoch": 0.15572760734364152, "grad_norm": 0.302734375, "grad_norm_var": 0.00044261614481608074, "learning_rate": 0.0001, "loss": 1.3212, "loss/crossentropy": 2.696122169494629, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1766430214047432, "step": 10429 }, { "epoch": 0.15574253951425648, "grad_norm": 0.337890625, "grad_norm_var": 0.00032817522684733074, "learning_rate": 0.0001, "loss": 1.5004, "loss/crossentropy": 2.555808901786804, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.2269720733165741, "step": 10430 }, { "epoch": 0.15575747168487147, "grad_norm": 0.31640625, "grad_norm_var": 0.00033919016520182293, "learning_rate": 0.0001, "loss": 1.5314, "loss/crossentropy": 2.7784388065338135, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.23449544608592987, "step": 10431 }, { "epoch": 0.15577240385548646, "grad_norm": 0.306640625, "grad_norm_var": 0.0003191630045572917, "learning_rate": 0.0001, "loss": 1.4232, "loss/crossentropy": 2.623862147331238, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20449922233819962, "step": 10432 }, { "epoch": 0.15578733602610143, "grad_norm": 0.31640625, "grad_norm_var": 0.00031863848368326824, "learning_rate": 0.0001, "loss": 1.3793, "loss/crossentropy": 2.348495125770569, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1878550574183464, "step": 10433 }, { "epoch": 0.15580226819671641, "grad_norm": 0.341796875, "grad_norm_var": 0.0003964583079020182, "learning_rate": 0.0001, "loss": 1.5895, "loss/crossentropy": 2.3124316930770874, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.23009414970874786, "step": 10434 }, { "epoch": 0.1558172003673314, "grad_norm": 0.296875, "grad_norm_var": 0.0003964583079020182, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.486102342605591, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20196333527565002, "step": 10435 }, { "epoch": 0.15583213253794637, "grad_norm": 0.35546875, "grad_norm_var": 0.0005292097727457682, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.437725067138672, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.18508590757846832, "step": 10436 }, { "epoch": 0.15584706470856136, "grad_norm": 0.306640625, "grad_norm_var": 0.0005302270253499349, "learning_rate": 0.0001, "loss": 1.4653, "loss/crossentropy": 2.581956624984741, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.21142540872097015, "step": 10437 }, { "epoch": 0.15586199687917635, "grad_norm": 0.390625, "grad_norm_var": 0.000926065444946289, "learning_rate": 0.0001, "loss": 1.5464, "loss/crossentropy": 2.2723960876464844, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.1869937777519226, "step": 10438 }, { "epoch": 0.15587692904979134, "grad_norm": 0.33203125, "grad_norm_var": 0.0009215672810872396, "learning_rate": 0.0001, "loss": 1.5097, "loss/crossentropy": 2.4960626363754272, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2245849072933197, "step": 10439 }, { "epoch": 0.1558918612204063, "grad_norm": 0.345703125, "grad_norm_var": 0.0009698867797851562, "learning_rate": 0.0001, "loss": 1.4456, "loss/crossentropy": 2.698475480079651, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2033778801560402, "step": 10440 }, { "epoch": 0.1559067933910213, "grad_norm": 0.26171875, "grad_norm_var": 0.0011293888092041016, "learning_rate": 0.0001, "loss": 1.3186, "loss/crossentropy": 2.449510335922241, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17017658799886703, "step": 10441 }, { "epoch": 0.15592172556163628, "grad_norm": 0.291015625, "grad_norm_var": 0.0011513868967692057, "learning_rate": 0.0001, "loss": 1.4328, "loss/crossentropy": 2.2327771186828613, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.21015910804271698, "step": 10442 }, { "epoch": 0.15593665773225124, "grad_norm": 0.30078125, "grad_norm_var": 0.0011107762654622396, "learning_rate": 0.0001, "loss": 1.4744, "loss/crossentropy": 2.5064637660980225, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21661809086799622, "step": 10443 }, { "epoch": 0.15595158990286623, "grad_norm": 0.447265625, "grad_norm_var": 0.0019205729166666666, "learning_rate": 0.0001, "loss": 1.6214, "loss/crossentropy": 2.360652208328247, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.2229616791009903, "step": 10444 }, { "epoch": 0.15596652207348122, "grad_norm": 0.265625, "grad_norm_var": 0.002132272720336914, "learning_rate": 0.0001, "loss": 1.2165, "loss/crossentropy": 2.6254135370254517, "loss/fcd": 1.06640625, "loss/idx": 11.0, "loss/logits": 0.15006303787231445, "step": 10445 }, { "epoch": 0.15598145424409618, "grad_norm": 0.30859375, "grad_norm_var": 0.002138710021972656, "learning_rate": 0.0001, "loss": 1.2894, "loss/crossentropy": 2.7131781578063965, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.1800481304526329, "step": 10446 }, { "epoch": 0.15599638641471117, "grad_norm": 0.3203125, "grad_norm_var": 0.0021357218424479167, "learning_rate": 0.0001, "loss": 1.3499, "loss/crossentropy": 2.5466631650924683, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17025479674339294, "step": 10447 }, { "epoch": 0.15601131858532616, "grad_norm": 0.470703125, "grad_norm_var": 0.003433481852213542, "learning_rate": 0.0001, "loss": 1.8397, "loss/crossentropy": 2.8169907331466675, "loss/fcd": 1.55859375, "loss/idx": 11.0, "loss/logits": 0.2811042070388794, "step": 10448 }, { "epoch": 0.15602625075594115, "grad_norm": 0.275390625, "grad_norm_var": 0.0036374251047770183, "learning_rate": 0.0001, "loss": 1.3554, "loss/crossentropy": 2.6008460521698, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.183564230799675, "step": 10449 }, { "epoch": 0.1560411829265561, "grad_norm": 0.46484375, "grad_norm_var": 0.0047459284464518225, "learning_rate": 0.0001, "loss": 1.5536, "loss/crossentropy": 2.5993733406066895, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.22157013416290283, "step": 10450 }, { "epoch": 0.1560561150971711, "grad_norm": 0.3515625, "grad_norm_var": 0.004621315002441406, "learning_rate": 0.0001, "loss": 1.277, "loss/crossentropy": 2.6862833499908447, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.15983982384204865, "step": 10451 }, { "epoch": 0.1560710472677861, "grad_norm": 0.30859375, "grad_norm_var": 0.004680824279785156, "learning_rate": 0.0001, "loss": 1.3745, "loss/crossentropy": 2.5123379230499268, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.16748788952827454, "step": 10452 }, { "epoch": 0.15608597943840105, "grad_norm": 0.3046875, "grad_norm_var": 0.004689772923787435, "learning_rate": 0.0001, "loss": 1.3517, "loss/crossentropy": 2.8150604963302612, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.2072075456380844, "step": 10453 }, { "epoch": 0.15610091160901604, "grad_norm": 0.34375, "grad_norm_var": 0.00451048215230306, "learning_rate": 0.0001, "loss": 1.4252, "loss/crossentropy": 2.786823868751526, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20647212117910385, "step": 10454 }, { "epoch": 0.15611584377963103, "grad_norm": 0.314453125, "grad_norm_var": 0.004541524251302083, "learning_rate": 0.0001, "loss": 1.4277, "loss/crossentropy": 2.788770318031311, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.21673277765512466, "step": 10455 }, { "epoch": 0.15613077595024602, "grad_norm": 0.31640625, "grad_norm_var": 0.004557021458943685, "learning_rate": 0.0001, "loss": 1.3172, "loss/crossentropy": 2.5931055545806885, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.16872859001159668, "step": 10456 }, { "epoch": 0.15614570812086098, "grad_norm": 0.3046875, "grad_norm_var": 0.004257694880167643, "learning_rate": 0.0001, "loss": 1.3802, "loss/crossentropy": 2.6204384565353394, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1848442703485489, "step": 10457 }, { "epoch": 0.15616064029147597, "grad_norm": 0.34375, "grad_norm_var": 0.004109636942545573, "learning_rate": 0.0001, "loss": 1.513, "loss/crossentropy": 2.4315730333328247, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.22393450140953064, "step": 10458 }, { "epoch": 0.15617557246209096, "grad_norm": 0.302734375, "grad_norm_var": 0.004099639256795248, "learning_rate": 0.0001, "loss": 1.3879, "loss/crossentropy": 2.6442575454711914, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19261708855628967, "step": 10459 }, { "epoch": 0.15619050463270592, "grad_norm": 0.330078125, "grad_norm_var": 0.00328520139058431, "learning_rate": 0.0001, "loss": 1.4384, "loss/crossentropy": 2.9784305095672607, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2079249620437622, "step": 10460 }, { "epoch": 0.1562054368033209, "grad_norm": 0.28515625, "grad_norm_var": 0.0031338850657145183, "learning_rate": 0.0001, "loss": 1.3427, "loss/crossentropy": 2.5927298069000244, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18649151176214218, "step": 10461 }, { "epoch": 0.1562203689739359, "grad_norm": 0.291015625, "grad_norm_var": 0.0032129923502604166, "learning_rate": 0.0001, "loss": 1.4519, "loss/crossentropy": 2.762510657310486, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.2135869413614273, "step": 10462 }, { "epoch": 0.15623530114455086, "grad_norm": 0.291015625, "grad_norm_var": 0.003316227595011393, "learning_rate": 0.0001, "loss": 1.3583, "loss/crossentropy": 2.7582536935806274, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17859750986099243, "step": 10463 }, { "epoch": 0.15625023331516585, "grad_norm": 0.3203125, "grad_norm_var": 0.0019320170084635416, "learning_rate": 0.0001, "loss": 1.549, "loss/crossentropy": 2.7329691648483276, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.23646565526723862, "step": 10464 }, { "epoch": 0.15626516548578084, "grad_norm": 0.3046875, "grad_norm_var": 0.0018044630686442056, "learning_rate": 0.0001, "loss": 1.2981, "loss/crossentropy": 2.5620765686035156, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.1769922822713852, "step": 10465 }, { "epoch": 0.15628009765639583, "grad_norm": 0.263671875, "grad_norm_var": 0.000545501708984375, "learning_rate": 0.0001, "loss": 1.2361, "loss/crossentropy": 2.7264941930770874, "loss/fcd": 1.08203125, "loss/idx": 11.0, "loss/logits": 0.15409034490585327, "step": 10466 }, { "epoch": 0.1562950298270108, "grad_norm": 0.302734375, "grad_norm_var": 0.0004306634267171224, "learning_rate": 0.0001, "loss": 1.5133, "loss/crossentropy": 2.615951657295227, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.22421014308929443, "step": 10467 }, { "epoch": 0.15630996199762578, "grad_norm": 0.2734375, "grad_norm_var": 0.0005050500233968099, "learning_rate": 0.0001, "loss": 1.4191, "loss/crossentropy": 2.8001643419265747, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20034505426883698, "step": 10468 }, { "epoch": 0.15632489416824077, "grad_norm": 0.30078125, "grad_norm_var": 0.0005065759023030598, "learning_rate": 0.0001, "loss": 1.4167, "loss/crossentropy": 2.5556026697158813, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20188873261213303, "step": 10469 }, { "epoch": 0.15633982633885574, "grad_norm": 0.30859375, "grad_norm_var": 0.0004047234853108724, "learning_rate": 0.0001, "loss": 1.4216, "loss/crossentropy": 2.598856806755066, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1950744166970253, "step": 10470 }, { "epoch": 0.15635475850947073, "grad_norm": 0.33984375, "grad_norm_var": 0.0004826227823893229, "learning_rate": 0.0001, "loss": 1.4654, "loss/crossentropy": 2.9407453536987305, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20369857549667358, "step": 10471 }, { "epoch": 0.15636969068008572, "grad_norm": 0.3046875, "grad_norm_var": 0.00047327677408854165, "learning_rate": 0.0001, "loss": 1.4547, "loss/crossentropy": 2.685102105140686, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2085571438074112, "step": 10472 }, { "epoch": 0.1563846228507007, "grad_norm": 0.30078125, "grad_norm_var": 0.00047397613525390625, "learning_rate": 0.0001, "loss": 1.3411, "loss/crossentropy": 2.6183359622955322, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17314893752336502, "step": 10473 }, { "epoch": 0.15639955502131567, "grad_norm": 0.287109375, "grad_norm_var": 0.00037395159403483074, "learning_rate": 0.0001, "loss": 1.3539, "loss/crossentropy": 2.531623959541321, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18591603636741638, "step": 10474 }, { "epoch": 0.15641448719193066, "grad_norm": 0.380859375, "grad_norm_var": 0.0007795810699462891, "learning_rate": 0.0001, "loss": 1.5533, "loss/crossentropy": 2.5639203786849976, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.21732208877801895, "step": 10475 }, { "epoch": 0.15642941936254565, "grad_norm": 0.337890625, "grad_norm_var": 0.0008092085520426432, "learning_rate": 0.0001, "loss": 1.3454, "loss/crossentropy": 2.6586601734161377, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1774337887763977, "step": 10476 }, { "epoch": 0.1564443515331606, "grad_norm": 0.3203125, "grad_norm_var": 0.0007897535959879557, "learning_rate": 0.0001, "loss": 1.3684, "loss/crossentropy": 2.5668439865112305, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1886741891503334, "step": 10477 }, { "epoch": 0.1564592837037756, "grad_norm": 0.27734375, "grad_norm_var": 0.000832366943359375, "learning_rate": 0.0001, "loss": 1.2793, "loss/crossentropy": 2.428937792778015, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.16993192583322525, "step": 10478 }, { "epoch": 0.1564742158743906, "grad_norm": 0.390625, "grad_norm_var": 0.0012384891510009766, "learning_rate": 0.0001, "loss": 1.3606, "loss/crossentropy": 2.325237274169922, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1808869019150734, "step": 10479 }, { "epoch": 0.15648914804500555, "grad_norm": 1.2265625, "grad_norm_var": 0.05340981483459473, "learning_rate": 0.0001, "loss": 1.5452, "loss/crossentropy": 2.2667349576950073, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.21313519030809402, "step": 10480 }, { "epoch": 0.15650408021562054, "grad_norm": 0.353515625, "grad_norm_var": 0.053133646647135414, "learning_rate": 0.0001, "loss": 1.3588, "loss/crossentropy": 2.547156810760498, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19086229801177979, "step": 10481 }, { "epoch": 0.15651901238623553, "grad_norm": 0.427734375, "grad_norm_var": 0.052423350016276044, "learning_rate": 0.0001, "loss": 1.4943, "loss/crossentropy": 2.5091997385025024, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.193517804145813, "step": 10482 }, { "epoch": 0.15653394455685052, "grad_norm": 0.28515625, "grad_norm_var": 0.05263148943583171, "learning_rate": 0.0001, "loss": 1.3985, "loss/crossentropy": 2.556432843208313, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1992523968219757, "step": 10483 }, { "epoch": 0.15654887672746548, "grad_norm": 0.3046875, "grad_norm_var": 0.05223933855692545, "learning_rate": 0.0001, "loss": 1.292, "loss/crossentropy": 2.6611363887786865, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.16702182590961456, "step": 10484 }, { "epoch": 0.15656380889808047, "grad_norm": 0.41015625, "grad_norm_var": 0.051771148045857744, "learning_rate": 0.0001, "loss": 1.6644, "loss/crossentropy": 2.615452766418457, "loss/fcd": 1.390625, "loss/idx": 11.0, "loss/logits": 0.2737964689731598, "step": 10485 }, { "epoch": 0.15657874106869546, "grad_norm": 0.28125, "grad_norm_var": 0.05211828549702962, "learning_rate": 0.0001, "loss": 1.2504, "loss/crossentropy": 2.683281660079956, "loss/fcd": 1.08984375, "loss/idx": 11.0, "loss/logits": 0.1605444848537445, "step": 10486 }, { "epoch": 0.15659367323931042, "grad_norm": 0.3203125, "grad_norm_var": 0.05227087338765462, "learning_rate": 0.0001, "loss": 1.4642, "loss/crossentropy": 2.61273193359375, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.22593318670988083, "step": 10487 }, { "epoch": 0.1566086054099254, "grad_norm": 0.283203125, "grad_norm_var": 0.052538553873697914, "learning_rate": 0.0001, "loss": 1.3034, "loss/crossentropy": 2.6022342443466187, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17060667276382446, "step": 10488 }, { "epoch": 0.1566235375805404, "grad_norm": 0.291015625, "grad_norm_var": 0.05265641212463379, "learning_rate": 0.0001, "loss": 1.4518, "loss/crossentropy": 2.5389596223831177, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20574860274791718, "step": 10489 }, { "epoch": 0.1566384697511554, "grad_norm": 0.30078125, "grad_norm_var": 0.052487627665201826, "learning_rate": 0.0001, "loss": 1.4249, "loss/crossentropy": 2.6230216026306152, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20614707469940186, "step": 10490 }, { "epoch": 0.15665340192177035, "grad_norm": 0.294921875, "grad_norm_var": 0.053019142150878905, "learning_rate": 0.0001, "loss": 1.3258, "loss/crossentropy": 2.631189227104187, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17731845378875732, "step": 10491 }, { "epoch": 0.15666833409238534, "grad_norm": 0.306640625, "grad_norm_var": 0.05326226552327474, "learning_rate": 0.0001, "loss": 1.4059, "loss/crossentropy": 2.500287413597107, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19102803617715836, "step": 10492 }, { "epoch": 0.15668326626300033, "grad_norm": 0.296875, "grad_norm_var": 0.05348199208577474, "learning_rate": 0.0001, "loss": 1.3711, "loss/crossentropy": 2.518285036087036, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.191427543759346, "step": 10493 }, { "epoch": 0.1566981984336153, "grad_norm": 0.3515625, "grad_norm_var": 0.052828470865885414, "learning_rate": 0.0001, "loss": 1.5029, "loss/crossentropy": 2.799364686012268, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.22560345381498337, "step": 10494 }, { "epoch": 0.15671313060423028, "grad_norm": 0.37109375, "grad_norm_var": 0.05283196767171224, "learning_rate": 0.0001, "loss": 1.5077, "loss/crossentropy": 2.5962084531784058, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.20303454995155334, "step": 10495 }, { "epoch": 0.15672806277484527, "grad_norm": 0.341796875, "grad_norm_var": 0.0020774682362874348, "learning_rate": 0.0001, "loss": 1.575, "loss/crossentropy": 2.5457286834716797, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.25080616772174835, "step": 10496 }, { "epoch": 0.15674299494546023, "grad_norm": 0.322265625, "grad_norm_var": 0.0020250797271728514, "learning_rate": 0.0001, "loss": 1.6551, "loss/crossentropy": 2.3572850227355957, "loss/fcd": 1.40234375, "loss/idx": 11.0, "loss/logits": 0.25275974720716476, "step": 10497 }, { "epoch": 0.15675792711607522, "grad_norm": 0.3515625, "grad_norm_var": 0.0013376235961914062, "learning_rate": 0.0001, "loss": 1.4628, "loss/crossentropy": 2.4758716821670532, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.2049538493156433, "step": 10498 }, { "epoch": 0.1567728592866902, "grad_norm": 0.298828125, "grad_norm_var": 0.0012865543365478515, "learning_rate": 0.0001, "loss": 1.508, "loss/crossentropy": 2.792953848838806, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.23453252017498016, "step": 10499 }, { "epoch": 0.1567877914573052, "grad_norm": 0.47265625, "grad_norm_var": 0.0026972293853759766, "learning_rate": 0.0001, "loss": 1.5637, "loss/crossentropy": 2.641576886177063, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.24731094390153885, "step": 10500 }, { "epoch": 0.15680272362792017, "grad_norm": 0.3203125, "grad_norm_var": 0.0022526899973551433, "learning_rate": 0.0001, "loss": 1.3341, "loss/crossentropy": 2.7922592163085938, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.17784971743822098, "step": 10501 }, { "epoch": 0.15681765579853515, "grad_norm": 0.318359375, "grad_norm_var": 0.002120717366536458, "learning_rate": 0.0001, "loss": 1.5584, "loss/crossentropy": 2.709144353866577, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.2576502561569214, "step": 10502 }, { "epoch": 0.15683258796915014, "grad_norm": 0.3125, "grad_norm_var": 0.002132161458333333, "learning_rate": 0.0001, "loss": 1.3438, "loss/crossentropy": 2.4360822439193726, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.19147545099258423, "step": 10503 }, { "epoch": 0.1568475201397651, "grad_norm": 0.310546875, "grad_norm_var": 0.0020186742146809894, "learning_rate": 0.0001, "loss": 1.3902, "loss/crossentropy": 2.5019952058792114, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1987483724951744, "step": 10504 }, { "epoch": 0.1568624523103801, "grad_norm": 0.34375, "grad_norm_var": 0.001926406224568685, "learning_rate": 0.0001, "loss": 1.3977, "loss/crossentropy": 2.7551499605178833, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.20239785313606262, "step": 10505 }, { "epoch": 0.15687738448099509, "grad_norm": 0.326171875, "grad_norm_var": 0.0018604914347330728, "learning_rate": 0.0001, "loss": 1.4789, "loss/crossentropy": 2.743988871574402, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2093588411808014, "step": 10506 }, { "epoch": 0.15689231665161005, "grad_norm": 0.318359375, "grad_norm_var": 0.001773516337076823, "learning_rate": 0.0001, "loss": 1.3107, "loss/crossentropy": 2.9146004915237427, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17003822326660156, "step": 10507 }, { "epoch": 0.15690724882222504, "grad_norm": 0.294921875, "grad_norm_var": 0.0018267313639322917, "learning_rate": 0.0001, "loss": 1.5048, "loss/crossentropy": 2.488862633705139, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21572408080101013, "step": 10508 }, { "epoch": 0.15692218099284003, "grad_norm": 0.314453125, "grad_norm_var": 0.0017579237620035807, "learning_rate": 0.0001, "loss": 1.4344, "loss/crossentropy": 2.32166188955307, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.19222469627857208, "step": 10509 }, { "epoch": 0.15693711316345502, "grad_norm": 0.283203125, "grad_norm_var": 0.0019042332967122396, "learning_rate": 0.0001, "loss": 1.3953, "loss/crossentropy": 2.599663257598877, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19213347882032394, "step": 10510 }, { "epoch": 0.15695204533406998, "grad_norm": 0.322265625, "grad_norm_var": 0.0017941633860270182, "learning_rate": 0.0001, "loss": 1.5126, "loss/crossentropy": 2.412415623664856, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.20404725521802902, "step": 10511 }, { "epoch": 0.15696697750468497, "grad_norm": 0.33984375, "grad_norm_var": 0.0017908732096354166, "learning_rate": 0.0001, "loss": 1.4078, "loss/crossentropy": 2.2738043069839478, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.17738094925880432, "step": 10512 }, { "epoch": 0.15698190967529996, "grad_norm": 0.31640625, "grad_norm_var": 0.0017975966135660808, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.8519190549850464, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1970178410410881, "step": 10513 }, { "epoch": 0.15699684184591492, "grad_norm": 0.310546875, "grad_norm_var": 0.0017725626627604167, "learning_rate": 0.0001, "loss": 1.502, "loss/crossentropy": 2.792907238006592, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.2363525554537773, "step": 10514 }, { "epoch": 0.1570117740165299, "grad_norm": 0.2890625, "grad_norm_var": 0.0018128554026285808, "learning_rate": 0.0001, "loss": 1.3645, "loss/crossentropy": 2.8295819759368896, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.19262848049402237, "step": 10515 }, { "epoch": 0.1570267061871449, "grad_norm": 0.31640625, "grad_norm_var": 0.00025391578674316406, "learning_rate": 0.0001, "loss": 1.3607, "loss/crossentropy": 2.7039204835891724, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18883780390024185, "step": 10516 }, { "epoch": 0.1570416383577599, "grad_norm": 0.3125, "grad_norm_var": 0.00025200843811035156, "learning_rate": 0.0001, "loss": 1.3981, "loss/crossentropy": 2.4071102142333984, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1871897280216217, "step": 10517 }, { "epoch": 0.15705657052837485, "grad_norm": 0.27734375, "grad_norm_var": 0.00033512115478515623, "learning_rate": 0.0001, "loss": 1.2827, "loss/crossentropy": 2.6304250955581665, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.16943323612213135, "step": 10518 }, { "epoch": 0.15707150269898984, "grad_norm": 0.330078125, "grad_norm_var": 0.00035614967346191405, "learning_rate": 0.0001, "loss": 1.4981, "loss/crossentropy": 2.4876402616500854, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.20118896663188934, "step": 10519 }, { "epoch": 0.15708643486960483, "grad_norm": 0.384765625, "grad_norm_var": 0.0006774743398030599, "learning_rate": 0.0001, "loss": 1.6283, "loss/crossentropy": 2.7196414470672607, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.2572440207004547, "step": 10520 }, { "epoch": 0.1571013670402198, "grad_norm": 0.333984375, "grad_norm_var": 0.000649261474609375, "learning_rate": 0.0001, "loss": 1.4011, "loss/crossentropy": 2.4441739320755005, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19799891114234924, "step": 10521 }, { "epoch": 0.15711629921083478, "grad_norm": 0.330078125, "grad_norm_var": 0.0006550470987955729, "learning_rate": 0.0001, "loss": 1.4385, "loss/crossentropy": 2.8698381185531616, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2041368931531906, "step": 10522 }, { "epoch": 0.15713123138144977, "grad_norm": 0.38671875, "grad_norm_var": 0.0009582360585530599, "learning_rate": 0.0001, "loss": 1.6151, "loss/crossentropy": 2.6199053525924683, "loss/fcd": 1.375, "loss/idx": 11.0, "loss/logits": 0.24011824280023575, "step": 10523 }, { "epoch": 0.15714616355206473, "grad_norm": 0.291015625, "grad_norm_var": 0.0009729862213134766, "learning_rate": 0.0001, "loss": 1.3642, "loss/crossentropy": 2.646757125854492, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.19233397394418716, "step": 10524 }, { "epoch": 0.15716109572267972, "grad_norm": 0.287109375, "grad_norm_var": 0.0010441939036051433, "learning_rate": 0.0001, "loss": 1.4512, "loss/crossentropy": 2.3838305473327637, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.2129337042570114, "step": 10525 }, { "epoch": 0.1571760278932947, "grad_norm": 0.302734375, "grad_norm_var": 0.0009736220041910807, "learning_rate": 0.0001, "loss": 1.3378, "loss/crossentropy": 2.7788121700286865, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18153607100248337, "step": 10526 }, { "epoch": 0.1571909600639097, "grad_norm": 0.34375, "grad_norm_var": 0.0010070164998372396, "learning_rate": 0.0001, "loss": 1.4096, "loss/crossentropy": 2.7167998552322388, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.179108627140522, "step": 10527 }, { "epoch": 0.15720589223452466, "grad_norm": 0.265625, "grad_norm_var": 0.0011749267578125, "learning_rate": 0.0001, "loss": 1.3054, "loss/crossentropy": 2.516543388366699, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.18044211715459824, "step": 10528 }, { "epoch": 0.15722082440513965, "grad_norm": 0.318359375, "grad_norm_var": 0.00117491086324056, "learning_rate": 0.0001, "loss": 1.5466, "loss/crossentropy": 2.4406378269195557, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.2184598222374916, "step": 10529 }, { "epoch": 0.15723575657575464, "grad_norm": 0.3671875, "grad_norm_var": 0.0013228734334309897, "learning_rate": 0.0001, "loss": 1.6363, "loss/crossentropy": 2.340298652648926, "loss/fcd": 1.40625, "loss/idx": 11.0, "loss/logits": 0.23001600056886673, "step": 10530 }, { "epoch": 0.1572506887463696, "grad_norm": 0.28515625, "grad_norm_var": 0.001340484619140625, "learning_rate": 0.0001, "loss": 1.4108, "loss/crossentropy": 2.8080304861068726, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1998824030160904, "step": 10531 }, { "epoch": 0.1572656209169846, "grad_norm": 0.294921875, "grad_norm_var": 0.0013819217681884765, "learning_rate": 0.0001, "loss": 1.4379, "loss/crossentropy": 2.395872712135315, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.1995839849114418, "step": 10532 }, { "epoch": 0.15728055308759958, "grad_norm": 0.296875, "grad_norm_var": 0.0014116764068603516, "learning_rate": 0.0001, "loss": 1.3291, "loss/crossentropy": 2.7700406312942505, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18456732481718063, "step": 10533 }, { "epoch": 0.15729548525821457, "grad_norm": 0.318359375, "grad_norm_var": 0.0012918472290039062, "learning_rate": 0.0001, "loss": 1.3971, "loss/crossentropy": 2.840429186820984, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.2017553672194481, "step": 10534 }, { "epoch": 0.15731041742882954, "grad_norm": 0.2578125, "grad_norm_var": 0.0015312035878499348, "learning_rate": 0.0001, "loss": 1.3164, "loss/crossentropy": 2.5191848278045654, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1679983139038086, "step": 10535 }, { "epoch": 0.15732534959944453, "grad_norm": 0.337890625, "grad_norm_var": 0.00124204953511556, "learning_rate": 0.0001, "loss": 1.5605, "loss/crossentropy": 2.5757033824920654, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.23629459738731384, "step": 10536 }, { "epoch": 0.15734028177005951, "grad_norm": 0.298828125, "grad_norm_var": 0.00122373898824056, "learning_rate": 0.0001, "loss": 1.454, "loss/crossentropy": 2.6048721075057983, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20005779713392258, "step": 10537 }, { "epoch": 0.15735521394067448, "grad_norm": 0.337890625, "grad_norm_var": 0.0012470086415608723, "learning_rate": 0.0001, "loss": 1.527, "loss/crossentropy": 2.4467809200286865, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.21058566868305206, "step": 10538 }, { "epoch": 0.15737014611128947, "grad_norm": 0.296875, "grad_norm_var": 0.000855112075805664, "learning_rate": 0.0001, "loss": 1.4465, "loss/crossentropy": 2.450725793838501, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21605702489614487, "step": 10539 }, { "epoch": 0.15738507828190446, "grad_norm": 0.35546875, "grad_norm_var": 0.0009836196899414063, "learning_rate": 0.0001, "loss": 1.3597, "loss/crossentropy": 2.808698534965515, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1878572702407837, "step": 10540 }, { "epoch": 0.15740001045251942, "grad_norm": 0.296875, "grad_norm_var": 0.0009593804677327474, "learning_rate": 0.0001, "loss": 1.4357, "loss/crossentropy": 2.579189896583557, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20911535620689392, "step": 10541 }, { "epoch": 0.1574149426231344, "grad_norm": 0.392578125, "grad_norm_var": 0.0013659000396728516, "learning_rate": 0.0001, "loss": 1.3721, "loss/crossentropy": 2.680758833885193, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18853890895843506, "step": 10542 }, { "epoch": 0.1574298747937494, "grad_norm": 0.3203125, "grad_norm_var": 0.001315164566040039, "learning_rate": 0.0001, "loss": 1.5468, "loss/crossentropy": 2.6941709518432617, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2342977672815323, "step": 10543 }, { "epoch": 0.1574448069643644, "grad_norm": 0.3046875, "grad_norm_var": 0.0011530399322509765, "learning_rate": 0.0001, "loss": 1.4816, "loss/crossentropy": 2.662776827812195, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2198672741651535, "step": 10544 }, { "epoch": 0.15745973913497935, "grad_norm": 0.322265625, "grad_norm_var": 0.0011544386545817057, "learning_rate": 0.0001, "loss": 1.2567, "loss/crossentropy": 2.863307237625122, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.15899868309497833, "step": 10545 }, { "epoch": 0.15747467130559434, "grad_norm": 0.34375, "grad_norm_var": 0.0010342756907145182, "learning_rate": 0.0001, "loss": 1.4076, "loss/crossentropy": 2.627857208251953, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19273793697357178, "step": 10546 }, { "epoch": 0.15748960347620933, "grad_norm": 0.33984375, "grad_norm_var": 0.0009942213694254557, "learning_rate": 0.0001, "loss": 1.357, "loss/crossentropy": 2.5832018852233887, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1734507530927658, "step": 10547 }, { "epoch": 0.1575045356468243, "grad_norm": 0.291015625, "grad_norm_var": 0.0010080814361572265, "learning_rate": 0.0001, "loss": 1.4398, "loss/crossentropy": 2.5586880445480347, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.21325771510601044, "step": 10548 }, { "epoch": 0.15751946781743928, "grad_norm": 0.279296875, "grad_norm_var": 0.001080322265625, "learning_rate": 0.0001, "loss": 1.2859, "loss/crossentropy": 2.6046942472457886, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.17260615527629852, "step": 10549 }, { "epoch": 0.15753439998805427, "grad_norm": 0.361328125, "grad_norm_var": 0.0011957168579101562, "learning_rate": 0.0001, "loss": 1.4892, "loss/crossentropy": 2.853895664215088, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.22358239442110062, "step": 10550 }, { "epoch": 0.15754933215866926, "grad_norm": 0.318359375, "grad_norm_var": 0.0009143670399983724, "learning_rate": 0.0001, "loss": 1.4647, "loss/crossentropy": 2.666301727294922, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21473505347967148, "step": 10551 }, { "epoch": 0.15756426432928422, "grad_norm": 0.30859375, "grad_norm_var": 0.0009169896443684896, "learning_rate": 0.0001, "loss": 1.4704, "loss/crossentropy": 2.3620110750198364, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.19691269099712372, "step": 10552 }, { "epoch": 0.1575791964998992, "grad_norm": 0.3828125, "grad_norm_var": 0.001087172826131185, "learning_rate": 0.0001, "loss": 1.6105, "loss/crossentropy": 2.5288885831832886, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.2316332310438156, "step": 10553 }, { "epoch": 0.1575941286705142, "grad_norm": 0.310546875, "grad_norm_var": 0.0010987440745035806, "learning_rate": 0.0001, "loss": 1.5138, "loss/crossentropy": 2.3251248598098755, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2286204770207405, "step": 10554 }, { "epoch": 0.15760906084112916, "grad_norm": 0.298828125, "grad_norm_var": 0.0010912577311197917, "learning_rate": 0.0001, "loss": 1.3835, "loss/crossentropy": 2.5253387689590454, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18432023376226425, "step": 10555 }, { "epoch": 0.15762399301174415, "grad_norm": 0.287109375, "grad_norm_var": 0.001120742162068685, "learning_rate": 0.0001, "loss": 1.4428, "loss/crossentropy": 2.5763179063796997, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2162828966975212, "step": 10556 }, { "epoch": 0.15763892518235914, "grad_norm": 0.341796875, "grad_norm_var": 0.00109405517578125, "learning_rate": 0.0001, "loss": 1.4615, "loss/crossentropy": 2.551624894142151, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.19590231776237488, "step": 10557 }, { "epoch": 0.1576538573529741, "grad_norm": 0.30078125, "grad_norm_var": 0.0007959842681884765, "learning_rate": 0.0001, "loss": 1.333, "loss/crossentropy": 2.6986663341522217, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18063996732234955, "step": 10558 }, { "epoch": 0.1576687895235891, "grad_norm": 0.310546875, "grad_norm_var": 0.0008008321126302083, "learning_rate": 0.0001, "loss": 1.4031, "loss/crossentropy": 2.8218172788619995, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20385225862264633, "step": 10559 }, { "epoch": 0.15768372169420408, "grad_norm": 0.314453125, "grad_norm_var": 0.0007883548736572265, "learning_rate": 0.0001, "loss": 1.4202, "loss/crossentropy": 2.7391830682754517, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20927049219608307, "step": 10560 }, { "epoch": 0.15769865386481907, "grad_norm": 0.255859375, "grad_norm_var": 0.0010391076405843098, "learning_rate": 0.0001, "loss": 1.337, "loss/crossentropy": 2.4021776914596558, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1729394793510437, "step": 10561 }, { "epoch": 0.15771358603543403, "grad_norm": 0.3671875, "grad_norm_var": 0.0011623223622639974, "learning_rate": 0.0001, "loss": 1.51, "loss/crossentropy": 2.703902840614319, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.20528669655323029, "step": 10562 }, { "epoch": 0.15772851820604902, "grad_norm": 0.31640625, "grad_norm_var": 0.00112455685933431, "learning_rate": 0.0001, "loss": 1.4908, "loss/crossentropy": 2.3734229803085327, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.19781382381916046, "step": 10563 }, { "epoch": 0.157743450376664, "grad_norm": 0.375, "grad_norm_var": 0.0012933731079101563, "learning_rate": 0.0001, "loss": 1.3862, "loss/crossentropy": 2.4042234420776367, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.17139491438865662, "step": 10564 }, { "epoch": 0.15775838254727897, "grad_norm": 0.2890625, "grad_norm_var": 0.0012456099192301431, "learning_rate": 0.0001, "loss": 1.4262, "loss/crossentropy": 2.4451773166656494, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.21137912571430206, "step": 10565 }, { "epoch": 0.15777331471789396, "grad_norm": 0.341796875, "grad_norm_var": 0.001164865493774414, "learning_rate": 0.0001, "loss": 1.3415, "loss/crossentropy": 2.740541100502014, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18911197036504745, "step": 10566 }, { "epoch": 0.15778824688850895, "grad_norm": 0.314453125, "grad_norm_var": 0.0011666456858317057, "learning_rate": 0.0001, "loss": 1.4812, "loss/crossentropy": 2.824536085128784, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21951545029878616, "step": 10567 }, { "epoch": 0.15780317905912392, "grad_norm": 0.298828125, "grad_norm_var": 0.0011870702107747396, "learning_rate": 0.0001, "loss": 1.431, "loss/crossentropy": 2.734873414039612, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19663137197494507, "step": 10568 }, { "epoch": 0.1578181112297389, "grad_norm": 0.29296875, "grad_norm_var": 0.0009282430013020834, "learning_rate": 0.0001, "loss": 1.2553, "loss/crossentropy": 2.5454559326171875, "loss/fcd": 1.08984375, "loss/idx": 11.0, "loss/logits": 0.1654125303030014, "step": 10569 }, { "epoch": 0.1578330434003539, "grad_norm": 0.455078125, "grad_norm_var": 0.0021773656209309895, "learning_rate": 0.0001, "loss": 1.5001, "loss/crossentropy": 2.5682660341262817, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2110636979341507, "step": 10570 }, { "epoch": 0.15784797557096888, "grad_norm": 0.333984375, "grad_norm_var": 0.002143605550130208, "learning_rate": 0.0001, "loss": 1.4392, "loss/crossentropy": 2.333440661430359, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.19310858100652695, "step": 10571 }, { "epoch": 0.15786290774158385, "grad_norm": 0.294921875, "grad_norm_var": 0.0021082560221354165, "learning_rate": 0.0001, "loss": 1.2994, "loss/crossentropy": 2.618277668952942, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.16263563930988312, "step": 10572 }, { "epoch": 0.15787783991219884, "grad_norm": 0.404296875, "grad_norm_var": 0.002490743001302083, "learning_rate": 0.0001, "loss": 1.6056, "loss/crossentropy": 2.1936694383621216, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.2540838271379471, "step": 10573 }, { "epoch": 0.15789277208281383, "grad_norm": 0.41015625, "grad_norm_var": 0.0028254191080729165, "learning_rate": 0.0001, "loss": 1.6652, "loss/crossentropy": 2.1249696016311646, "loss/fcd": 1.44921875, "loss/idx": 11.0, "loss/logits": 0.2160107046365738, "step": 10574 }, { "epoch": 0.1579077042534288, "grad_norm": 0.30859375, "grad_norm_var": 0.0028322696685791015, "learning_rate": 0.0001, "loss": 1.4198, "loss/crossentropy": 2.7755727767944336, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19322321563959122, "step": 10575 }, { "epoch": 0.15792263642404378, "grad_norm": 0.3515625, "grad_norm_var": 0.002812639872233073, "learning_rate": 0.0001, "loss": 1.4028, "loss/crossentropy": 2.633242130279541, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.18795980513095856, "step": 10576 }, { "epoch": 0.15793756859465877, "grad_norm": 0.318359375, "grad_norm_var": 0.0023711522420247397, "learning_rate": 0.0001, "loss": 1.3556, "loss/crossentropy": 2.7568893432617188, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.17985429614782333, "step": 10577 }, { "epoch": 0.15795250076527376, "grad_norm": 0.345703125, "grad_norm_var": 0.0023279666900634767, "learning_rate": 0.0001, "loss": 1.4899, "loss/crossentropy": 2.7106913328170776, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.20477253198623657, "step": 10578 }, { "epoch": 0.15796743293588872, "grad_norm": 0.30859375, "grad_norm_var": 0.0023570855458577475, "learning_rate": 0.0001, "loss": 1.4903, "loss/crossentropy": 2.8857699632644653, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.22862902283668518, "step": 10579 }, { "epoch": 0.1579823651065037, "grad_norm": 0.32421875, "grad_norm_var": 0.00228269894917806, "learning_rate": 0.0001, "loss": 1.6301, "loss/crossentropy": 2.6928930282592773, "loss/fcd": 1.375, "loss/idx": 11.0, "loss/logits": 0.2551404759287834, "step": 10580 }, { "epoch": 0.1579972972771187, "grad_norm": 0.359375, "grad_norm_var": 0.002141936620076497, "learning_rate": 0.0001, "loss": 1.5067, "loss/crossentropy": 2.416994094848633, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.20596610754728317, "step": 10581 }, { "epoch": 0.15801222944773366, "grad_norm": 0.326171875, "grad_norm_var": 0.0021564324696858725, "learning_rate": 0.0001, "loss": 1.3583, "loss/crossentropy": 2.3795324563980103, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18249453604221344, "step": 10582 }, { "epoch": 0.15802716161834865, "grad_norm": 0.3125, "grad_norm_var": 0.0021634419759114583, "learning_rate": 0.0001, "loss": 1.4098, "loss/crossentropy": 2.5086841583251953, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.1871797814965248, "step": 10583 }, { "epoch": 0.15804209378896364, "grad_norm": 0.302734375, "grad_norm_var": 0.002142779032389323, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.554115891456604, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19414085894823074, "step": 10584 }, { "epoch": 0.1580570259595786, "grad_norm": 0.267578125, "grad_norm_var": 0.0023442427317301433, "learning_rate": 0.0001, "loss": 1.264, "loss/crossentropy": 2.4661705493927, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.1624680832028389, "step": 10585 }, { "epoch": 0.1580719581301936, "grad_norm": 0.275390625, "grad_norm_var": 0.0015809218088785808, "learning_rate": 0.0001, "loss": 1.262, "loss/crossentropy": 2.6735259294509888, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.16433065384626389, "step": 10586 }, { "epoch": 0.15808689030080858, "grad_norm": 0.296875, "grad_norm_var": 0.001636187235514323, "learning_rate": 0.0001, "loss": 1.2998, "loss/crossentropy": 2.594533681869507, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.17085903137922287, "step": 10587 }, { "epoch": 0.15810182247142357, "grad_norm": 0.29296875, "grad_norm_var": 0.0016443729400634766, "learning_rate": 0.0001, "loss": 1.4349, "loss/crossentropy": 2.4948219060897827, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.21615423262119293, "step": 10588 }, { "epoch": 0.15811675464203853, "grad_norm": 0.279296875, "grad_norm_var": 0.00130461057027181, "learning_rate": 0.0001, "loss": 1.3315, "loss/crossentropy": 2.590007781982422, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1752525195479393, "step": 10589 }, { "epoch": 0.15813168681265352, "grad_norm": 0.32421875, "grad_norm_var": 0.0007045586903889974, "learning_rate": 0.0001, "loss": 1.4465, "loss/crossentropy": 2.495219111442566, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21605601161718369, "step": 10590 }, { "epoch": 0.1581466189832685, "grad_norm": 0.287109375, "grad_norm_var": 0.0007435480753580729, "learning_rate": 0.0001, "loss": 1.3512, "loss/crossentropy": 2.5152881145477295, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18326031416654587, "step": 10591 }, { "epoch": 0.15816155115388347, "grad_norm": 0.29296875, "grad_norm_var": 0.0006395975748697916, "learning_rate": 0.0001, "loss": 1.2912, "loss/crossentropy": 2.731284737586975, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.17011937499046326, "step": 10592 }, { "epoch": 0.15817648332449846, "grad_norm": 0.328125, "grad_norm_var": 0.0006601810455322266, "learning_rate": 0.0001, "loss": 1.5613, "loss/crossentropy": 2.675346851348877, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.23312994837760925, "step": 10593 }, { "epoch": 0.15819141549511345, "grad_norm": 0.3203125, "grad_norm_var": 0.0005719502766927083, "learning_rate": 0.0001, "loss": 1.4635, "loss/crossentropy": 2.4708824157714844, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.19789032638072968, "step": 10594 }, { "epoch": 0.15820634766572844, "grad_norm": 0.3125, "grad_norm_var": 0.000574175516764323, "learning_rate": 0.0001, "loss": 1.4624, "loss/crossentropy": 2.6374692916870117, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.2085031270980835, "step": 10595 }, { "epoch": 0.1582212798363434, "grad_norm": 0.3359375, "grad_norm_var": 0.0006106058756510417, "learning_rate": 0.0001, "loss": 1.4951, "loss/crossentropy": 2.5586585998535156, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22560087591409683, "step": 10596 }, { "epoch": 0.1582362120069584, "grad_norm": 0.291015625, "grad_norm_var": 0.0004264672597249349, "learning_rate": 0.0001, "loss": 1.355, "loss/crossentropy": 2.419962167739868, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17533504217863083, "step": 10597 }, { "epoch": 0.15825114417757338, "grad_norm": 0.3203125, "grad_norm_var": 0.00041039784749348957, "learning_rate": 0.0001, "loss": 1.3679, "loss/crossentropy": 2.6465178728103638, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19207361340522766, "step": 10598 }, { "epoch": 0.15826607634818834, "grad_norm": 0.318359375, "grad_norm_var": 0.0004203637440999349, "learning_rate": 0.0001, "loss": 1.3949, "loss/crossentropy": 2.5128144025802612, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1918230950832367, "step": 10599 }, { "epoch": 0.15828100851880333, "grad_norm": 0.3125, "grad_norm_var": 0.0004261652628580729, "learning_rate": 0.0001, "loss": 1.5821, "loss/crossentropy": 2.4180943965911865, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.24619555473327637, "step": 10600 }, { "epoch": 0.15829594068941832, "grad_norm": 0.32421875, "grad_norm_var": 0.00035564104715983075, "learning_rate": 0.0001, "loss": 1.3503, "loss/crossentropy": 2.7590832710266113, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18619627505540848, "step": 10601 }, { "epoch": 0.15831087286003329, "grad_norm": 0.341796875, "grad_norm_var": 0.0003513177235921224, "learning_rate": 0.0001, "loss": 1.5257, "loss/crossentropy": 2.8574951887130737, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.2209724336862564, "step": 10602 }, { "epoch": 0.15832580503064828, "grad_norm": 0.34765625, "grad_norm_var": 0.0004157861073811849, "learning_rate": 0.0001, "loss": 1.4191, "loss/crossentropy": 2.460646390914917, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20036283135414124, "step": 10603 }, { "epoch": 0.15834073720126327, "grad_norm": 0.30859375, "grad_norm_var": 0.0003865400950113932, "learning_rate": 0.0001, "loss": 1.4402, "loss/crossentropy": 2.860232353210449, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20584381371736526, "step": 10604 }, { "epoch": 0.15835566937187825, "grad_norm": 0.34765625, "grad_norm_var": 0.00035037994384765623, "learning_rate": 0.0001, "loss": 1.4073, "loss/crossentropy": 2.536958336830139, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.18854256719350815, "step": 10605 }, { "epoch": 0.15837060154249322, "grad_norm": 0.27734375, "grad_norm_var": 0.00045871734619140625, "learning_rate": 0.0001, "loss": 1.3842, "loss/crossentropy": 2.555080533027649, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1888819932937622, "step": 10606 }, { "epoch": 0.1583855337131082, "grad_norm": 0.279296875, "grad_norm_var": 0.0004933039347330729, "learning_rate": 0.0001, "loss": 1.3859, "loss/crossentropy": 2.60428524017334, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1906372606754303, "step": 10607 }, { "epoch": 0.1584004658837232, "grad_norm": 0.34375, "grad_norm_var": 0.0004974365234375, "learning_rate": 0.0001, "loss": 1.516, "loss/crossentropy": 2.5599530935287476, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.23084527999162674, "step": 10608 }, { "epoch": 0.15841539805433816, "grad_norm": 0.36328125, "grad_norm_var": 0.0006158828735351562, "learning_rate": 0.0001, "loss": 1.6768, "loss/crossentropy": 2.5072754621505737, "loss/fcd": 1.390625, "loss/idx": 11.0, "loss/logits": 0.28613221645355225, "step": 10609 }, { "epoch": 0.15843033022495315, "grad_norm": 0.306640625, "grad_norm_var": 0.0006297906239827474, "learning_rate": 0.0001, "loss": 1.4961, "loss/crossentropy": 2.3965665102005005, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.20311465114355087, "step": 10610 }, { "epoch": 0.15844526239556814, "grad_norm": 0.287109375, "grad_norm_var": 0.0006977717081705729, "learning_rate": 0.0001, "loss": 1.3083, "loss/crossentropy": 2.49821138381958, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.16376206278800964, "step": 10611 }, { "epoch": 0.15846019456618313, "grad_norm": 0.314453125, "grad_norm_var": 0.0006783644358317058, "learning_rate": 0.0001, "loss": 1.5418, "loss/crossentropy": 2.285956859588623, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.24488752335309982, "step": 10612 }, { "epoch": 0.1584751267367981, "grad_norm": 0.330078125, "grad_norm_var": 0.0006344954172770183, "learning_rate": 0.0001, "loss": 1.5358, "loss/crossentropy": 2.5811896324157715, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.22332604229450226, "step": 10613 }, { "epoch": 0.15849005890741308, "grad_norm": 0.322265625, "grad_norm_var": 0.000634765625, "learning_rate": 0.0001, "loss": 1.734, "loss/crossentropy": 2.49515700340271, "loss/fcd": 1.43359375, "loss/idx": 11.0, "loss/logits": 0.3004274293780327, "step": 10614 }, { "epoch": 0.15850499107802807, "grad_norm": 0.357421875, "grad_norm_var": 0.0007199605305989583, "learning_rate": 0.0001, "loss": 1.4355, "loss/crossentropy": 2.687081217765808, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.18157930672168732, "step": 10615 }, { "epoch": 0.15851992324864303, "grad_norm": 0.302734375, "grad_norm_var": 0.0007392724355061848, "learning_rate": 0.0001, "loss": 1.3306, "loss/crossentropy": 2.618167996406555, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1782500222325325, "step": 10616 }, { "epoch": 0.15853485541925802, "grad_norm": 0.34765625, "grad_norm_var": 0.0007800896962483724, "learning_rate": 0.0001, "loss": 1.5388, "loss/crossentropy": 2.640005588531494, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.21852320432662964, "step": 10617 }, { "epoch": 0.158549787589873, "grad_norm": 0.33203125, "grad_norm_var": 0.0007623672485351562, "learning_rate": 0.0001, "loss": 1.4766, "loss/crossentropy": 2.6955631971359253, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2344045490026474, "step": 10618 }, { "epoch": 0.15856471976048797, "grad_norm": 0.298828125, "grad_norm_var": 0.0007508436838785808, "learning_rate": 0.0001, "loss": 1.3604, "loss/crossentropy": 2.6668068170547485, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19243364036083221, "step": 10619 }, { "epoch": 0.15857965193110296, "grad_norm": 0.314453125, "grad_norm_var": 0.0007441202799479167, "learning_rate": 0.0001, "loss": 1.4911, "loss/crossentropy": 2.5351701974868774, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22151905298233032, "step": 10620 }, { "epoch": 0.15859458410171795, "grad_norm": 0.3515625, "grad_norm_var": 0.0007593154907226563, "learning_rate": 0.0001, "loss": 1.504, "loss/crossentropy": 2.5168555974960327, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.20716256648302078, "step": 10621 }, { "epoch": 0.15860951627233294, "grad_norm": 0.353515625, "grad_norm_var": 0.0006830692291259766, "learning_rate": 0.0001, "loss": 1.4554, "loss/crossentropy": 2.6338623762130737, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2249266505241394, "step": 10622 }, { "epoch": 0.1586244484429479, "grad_norm": 0.359375, "grad_norm_var": 0.0005924860636393229, "learning_rate": 0.0001, "loss": 1.3031, "loss/crossentropy": 2.5937291383743286, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.16640616953372955, "step": 10623 }, { "epoch": 0.1586393806135629, "grad_norm": 0.333984375, "grad_norm_var": 0.0005809624989827473, "learning_rate": 0.0001, "loss": 1.278, "loss/crossentropy": 2.6367125511169434, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.1607740968465805, "step": 10624 }, { "epoch": 0.15865431278417788, "grad_norm": 0.298828125, "grad_norm_var": 0.0005521138509114583, "learning_rate": 0.0001, "loss": 1.3707, "loss/crossentropy": 2.590811014175415, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1910163089632988, "step": 10625 }, { "epoch": 0.15866924495479284, "grad_norm": 0.28125, "grad_norm_var": 0.0006568749745686849, "learning_rate": 0.0001, "loss": 1.2609, "loss/crossentropy": 2.5255160331726074, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.15929213166236877, "step": 10626 }, { "epoch": 0.15868417712540783, "grad_norm": 0.2734375, "grad_norm_var": 0.0007359822591145833, "learning_rate": 0.0001, "loss": 1.3964, "loss/crossentropy": 2.479197859764099, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18932511657476425, "step": 10627 }, { "epoch": 0.15869910929602282, "grad_norm": 0.302734375, "grad_norm_var": 0.0007582982381184896, "learning_rate": 0.0001, "loss": 1.3049, "loss/crossentropy": 2.7380964756011963, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.17594555020332336, "step": 10628 }, { "epoch": 0.15871404146663778, "grad_norm": 0.287109375, "grad_norm_var": 0.0008303324381510416, "learning_rate": 0.0001, "loss": 1.4361, "loss/crossentropy": 2.4393390417099, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2095516100525856, "step": 10629 }, { "epoch": 0.15872897363725277, "grad_norm": 0.330078125, "grad_norm_var": 0.0008366902669270834, "learning_rate": 0.0001, "loss": 1.5428, "loss/crossentropy": 2.5189778804779053, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.22640472650527954, "step": 10630 }, { "epoch": 0.15874390580786776, "grad_norm": 0.68359375, "grad_norm_var": 0.009099817276000977, "learning_rate": 0.0001, "loss": 2.4502, "loss/crossentropy": 2.30048143863678, "loss/fcd": 1.98828125, "loss/idx": 11.0, "loss/logits": 0.46196798980236053, "step": 10631 }, { "epoch": 0.15875883797848275, "grad_norm": 0.390625, "grad_norm_var": 0.009137725830078125, "learning_rate": 0.0001, "loss": 1.7364, "loss/crossentropy": 2.468843936920166, "loss/fcd": 1.44140625, "loss/idx": 11.0, "loss/logits": 0.29494763910770416, "step": 10632 }, { "epoch": 0.15877377014909771, "grad_norm": 0.33203125, "grad_norm_var": 0.009149932861328125, "learning_rate": 0.0001, "loss": 1.4635, "loss/crossentropy": 2.6390024423599243, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20181762427091599, "step": 10633 }, { "epoch": 0.1587887023197127, "grad_norm": 0.306640625, "grad_norm_var": 0.009234857559204102, "learning_rate": 0.0001, "loss": 1.4377, "loss/crossentropy": 2.691365957260132, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2033511847257614, "step": 10634 }, { "epoch": 0.1588036344903277, "grad_norm": 0.326171875, "grad_norm_var": 0.009118254979451497, "learning_rate": 0.0001, "loss": 1.5391, "loss/crossentropy": 2.5825010538101196, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.2304685339331627, "step": 10635 }, { "epoch": 0.15881856666094266, "grad_norm": 0.333984375, "grad_norm_var": 0.009061670303344727, "learning_rate": 0.0001, "loss": 1.6144, "loss/crossentropy": 2.5525619983673096, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.23544856160879135, "step": 10636 }, { "epoch": 0.15883349883155765, "grad_norm": 0.3515625, "grad_norm_var": 0.009061670303344727, "learning_rate": 0.0001, "loss": 1.5745, "loss/crossentropy": 2.2561320066452026, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.2034284546971321, "step": 10637 }, { "epoch": 0.15884843100217264, "grad_norm": 0.26953125, "grad_norm_var": 0.009424591064453125, "learning_rate": 0.0001, "loss": 1.4183, "loss/crossentropy": 2.4347779750823975, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19959212094545364, "step": 10638 }, { "epoch": 0.15886336317278762, "grad_norm": 0.291015625, "grad_norm_var": 0.00955198605855306, "learning_rate": 0.0001, "loss": 1.4162, "loss/crossentropy": 2.401434540748596, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20131374150514603, "step": 10639 }, { "epoch": 0.1588782953434026, "grad_norm": 0.3125, "grad_norm_var": 0.009589576721191406, "learning_rate": 0.0001, "loss": 1.4376, "loss/crossentropy": 2.6774396896362305, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20327433943748474, "step": 10640 }, { "epoch": 0.15889322751401758, "grad_norm": 0.306640625, "grad_norm_var": 0.00955499013264974, "learning_rate": 0.0001, "loss": 1.4147, "loss/crossentropy": 2.7565783262252808, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.21161732822656631, "step": 10641 }, { "epoch": 0.15890815968463257, "grad_norm": 0.27734375, "grad_norm_var": 0.009584554036458333, "learning_rate": 0.0001, "loss": 1.3335, "loss/crossentropy": 2.505897641181946, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1694125011563301, "step": 10642 }, { "epoch": 0.15892309185524753, "grad_norm": 0.349609375, "grad_norm_var": 0.00931242307027181, "learning_rate": 0.0001, "loss": 1.4526, "loss/crossentropy": 2.7464962005615234, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.19865204393863678, "step": 10643 }, { "epoch": 0.15893802402586252, "grad_norm": 0.361328125, "grad_norm_var": 0.009230407079060872, "learning_rate": 0.0001, "loss": 1.4689, "loss/crossentropy": 2.529602289199829, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.1954352781176567, "step": 10644 }, { "epoch": 0.1589529561964775, "grad_norm": 0.345703125, "grad_norm_var": 0.008997710545857747, "learning_rate": 0.0001, "loss": 1.5161, "loss/crossentropy": 2.5707463026046753, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.21143049001693726, "step": 10645 }, { "epoch": 0.15896788836709247, "grad_norm": 0.35546875, "grad_norm_var": 0.008977254231770834, "learning_rate": 0.0001, "loss": 1.5792, "loss/crossentropy": 2.6653547286987305, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.2393735721707344, "step": 10646 }, { "epoch": 0.15898282053770746, "grad_norm": 0.400390625, "grad_norm_var": 0.0013786156972249348, "learning_rate": 0.0001, "loss": 1.6717, "loss/crossentropy": 2.3825953006744385, "loss/fcd": 1.43359375, "loss/idx": 11.0, "loss/logits": 0.23809245228767395, "step": 10647 }, { "epoch": 0.15899775270832245, "grad_norm": 0.29296875, "grad_norm_var": 0.0012101332346598308, "learning_rate": 0.0001, "loss": 1.3413, "loss/crossentropy": 2.5239486694335938, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1929098442196846, "step": 10648 }, { "epoch": 0.15901268487893744, "grad_norm": 0.32421875, "grad_norm_var": 0.0012074629465738931, "learning_rate": 0.0001, "loss": 1.4103, "loss/crossentropy": 2.7235820293426514, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19544600695371628, "step": 10649 }, { "epoch": 0.1590276170495524, "grad_norm": 0.310546875, "grad_norm_var": 0.0011986891428629558, "learning_rate": 0.0001, "loss": 1.4239, "loss/crossentropy": 2.3889150619506836, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.2090531513094902, "step": 10650 }, { "epoch": 0.1590425492201674, "grad_norm": 0.28125, "grad_norm_var": 0.0013211568196614583, "learning_rate": 0.0001, "loss": 1.4059, "loss/crossentropy": 2.5519427061080933, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19498923420906067, "step": 10651 }, { "epoch": 0.15905748139078238, "grad_norm": 0.291015625, "grad_norm_var": 0.0013722101847330728, "learning_rate": 0.0001, "loss": 1.3716, "loss/crossentropy": 2.598230481147766, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18804359436035156, "step": 10652 }, { "epoch": 0.15907241356139734, "grad_norm": 0.306640625, "grad_norm_var": 0.0013096968332926432, "learning_rate": 0.0001, "loss": 1.4519, "loss/crossentropy": 2.7516947984695435, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21365493535995483, "step": 10653 }, { "epoch": 0.15908734573201233, "grad_norm": 0.337890625, "grad_norm_var": 0.0011667251586914063, "learning_rate": 0.0001, "loss": 1.3722, "loss/crossentropy": 2.7494930028915405, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18082469701766968, "step": 10654 }, { "epoch": 0.15910227790262732, "grad_norm": 0.48046875, "grad_norm_var": 0.0026391188303629557, "learning_rate": 0.0001, "loss": 1.6223, "loss/crossentropy": 2.5586098432540894, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.25900811702013016, "step": 10655 }, { "epoch": 0.1591172100732423, "grad_norm": 0.3671875, "grad_norm_var": 0.0026738325754801433, "learning_rate": 0.0001, "loss": 1.4197, "loss/crossentropy": 2.7598373889923096, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2009091004729271, "step": 10656 }, { "epoch": 0.15913214224385727, "grad_norm": 0.330078125, "grad_norm_var": 0.0026139418284098306, "learning_rate": 0.0001, "loss": 1.4005, "loss/crossentropy": 2.695116877555847, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19348366558551788, "step": 10657 }, { "epoch": 0.15914707441447226, "grad_norm": 0.328125, "grad_norm_var": 0.002362680435180664, "learning_rate": 0.0001, "loss": 1.4304, "loss/crossentropy": 2.589065432548523, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19605828821659088, "step": 10658 }, { "epoch": 0.15916200658508725, "grad_norm": 0.322265625, "grad_norm_var": 0.002379592259724935, "learning_rate": 0.0001, "loss": 1.4565, "loss/crossentropy": 2.671596884727478, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.22213231027126312, "step": 10659 }, { "epoch": 0.1591769387557022, "grad_norm": 0.330078125, "grad_norm_var": 0.002350600560506185, "learning_rate": 0.0001, "loss": 1.4343, "loss/crossentropy": 2.6474013328552246, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19606202840805054, "step": 10660 }, { "epoch": 0.1591918709263172, "grad_norm": 0.34765625, "grad_norm_var": 0.0023529052734375, "learning_rate": 0.0001, "loss": 1.673, "loss/crossentropy": 2.583558440208435, "loss/fcd": 1.4375, "loss/idx": 11.0, "loss/logits": 0.23548482358455658, "step": 10661 }, { "epoch": 0.1592068030969322, "grad_norm": 0.265625, "grad_norm_var": 0.0026468276977539063, "learning_rate": 0.0001, "loss": 1.2973, "loss/crossentropy": 2.6495126485824585, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.17231891304254532, "step": 10662 }, { "epoch": 0.15922173526754715, "grad_norm": 0.318359375, "grad_norm_var": 0.0023223876953125, "learning_rate": 0.0001, "loss": 1.5096, "loss/crossentropy": 2.5017069578170776, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2166512906551361, "step": 10663 }, { "epoch": 0.15923666743816214, "grad_norm": 0.298828125, "grad_norm_var": 0.002297830581665039, "learning_rate": 0.0001, "loss": 1.2571, "loss/crossentropy": 2.654259204864502, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.1594603955745697, "step": 10664 }, { "epoch": 0.15925159960877713, "grad_norm": 0.353515625, "grad_norm_var": 0.002338600158691406, "learning_rate": 0.0001, "loss": 1.5292, "loss/crossentropy": 2.7171902656555176, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.22453753650188446, "step": 10665 }, { "epoch": 0.15926653177939212, "grad_norm": 0.35546875, "grad_norm_var": 0.0023521264394124348, "learning_rate": 0.0001, "loss": 1.5548, "loss/crossentropy": 2.4420074224472046, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.22279708087444305, "step": 10666 }, { "epoch": 0.15928146395000709, "grad_norm": 0.361328125, "grad_norm_var": 0.0022094090779622394, "learning_rate": 0.0001, "loss": 1.4154, "loss/crossentropy": 2.569597005844116, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.18495924770832062, "step": 10667 }, { "epoch": 0.15929639612062207, "grad_norm": 0.30859375, "grad_norm_var": 0.0021205743153889975, "learning_rate": 0.0001, "loss": 1.5007, "loss/crossentropy": 2.5101126432418823, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.2194986417889595, "step": 10668 }, { "epoch": 0.15931132829123706, "grad_norm": 0.30859375, "grad_norm_var": 0.002112579345703125, "learning_rate": 0.0001, "loss": 1.3624, "loss/crossentropy": 2.8020436763763428, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18665991723537445, "step": 10669 }, { "epoch": 0.15932626046185203, "grad_norm": 0.326171875, "grad_norm_var": 0.0021219253540039062, "learning_rate": 0.0001, "loss": 1.4892, "loss/crossentropy": 2.733223080635071, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.2119007185101509, "step": 10670 }, { "epoch": 0.15934119263246702, "grad_norm": 0.42578125, "grad_norm_var": 0.0012674331665039062, "learning_rate": 0.0001, "loss": 1.5429, "loss/crossentropy": 2.9907853603363037, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.2460446059703827, "step": 10671 }, { "epoch": 0.159356124803082, "grad_norm": 0.44921875, "grad_norm_var": 0.002048492431640625, "learning_rate": 0.0001, "loss": 1.4414, "loss/crossentropy": 2.7708001136779785, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21094051748514175, "step": 10672 }, { "epoch": 0.159371056973697, "grad_norm": 0.298828125, "grad_norm_var": 0.0021481831868489583, "learning_rate": 0.0001, "loss": 1.3697, "loss/crossentropy": 2.7745217084884644, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1900324523448944, "step": 10673 }, { "epoch": 0.15938598914431196, "grad_norm": 0.310546875, "grad_norm_var": 0.0021892388661702475, "learning_rate": 0.0001, "loss": 1.3557, "loss/crossentropy": 2.7335182428359985, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18382714688777924, "step": 10674 }, { "epoch": 0.15940092131492695, "grad_norm": 0.3515625, "grad_norm_var": 0.0021880467732747397, "learning_rate": 0.0001, "loss": 1.6458, "loss/crossentropy": 2.697422981262207, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.2668936923146248, "step": 10675 }, { "epoch": 0.15941585348554194, "grad_norm": 0.392578125, "grad_norm_var": 0.0023650487263997396, "learning_rate": 0.0001, "loss": 1.6077, "loss/crossentropy": 2.537269949913025, "loss/fcd": 1.390625, "loss/idx": 11.0, "loss/logits": 0.21704863756895065, "step": 10676 }, { "epoch": 0.1594307856561569, "grad_norm": 0.3671875, "grad_norm_var": 0.002403513590494792, "learning_rate": 0.0001, "loss": 1.6973, "loss/crossentropy": 2.3570737838745117, "loss/fcd": 1.43359375, "loss/idx": 11.0, "loss/logits": 0.26367151737213135, "step": 10677 }, { "epoch": 0.1594457178267719, "grad_norm": 0.294921875, "grad_norm_var": 0.002153889338175456, "learning_rate": 0.0001, "loss": 1.3511, "loss/crossentropy": 2.515329957008362, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18706945329904556, "step": 10678 }, { "epoch": 0.15946064999738688, "grad_norm": 0.33203125, "grad_norm_var": 0.0021168390909830728, "learning_rate": 0.0001, "loss": 1.4227, "loss/crossentropy": 2.6980937719345093, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20003880560398102, "step": 10679 }, { "epoch": 0.15947558216800184, "grad_norm": 0.314453125, "grad_norm_var": 0.0020339330037434894, "learning_rate": 0.0001, "loss": 1.4316, "loss/crossentropy": 2.4932466745376587, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20897381007671356, "step": 10680 }, { "epoch": 0.15949051433861683, "grad_norm": 0.31640625, "grad_norm_var": 0.00208738644917806, "learning_rate": 0.0001, "loss": 1.5394, "loss/crossentropy": 2.3942900896072388, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.21911638975143433, "step": 10681 }, { "epoch": 0.15950544650923182, "grad_norm": 0.32421875, "grad_norm_var": 0.002103153864542643, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.562601685523987, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20388439297676086, "step": 10682 }, { "epoch": 0.1595203786798468, "grad_norm": 0.3359375, "grad_norm_var": 0.002080217997233073, "learning_rate": 0.0001, "loss": 1.4103, "loss/crossentropy": 2.7973052263259888, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1915362849831581, "step": 10683 }, { "epoch": 0.15953531085046177, "grad_norm": 0.322265625, "grad_norm_var": 0.0020327091217041014, "learning_rate": 0.0001, "loss": 1.4042, "loss/crossentropy": 2.5448875427246094, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18149984627962112, "step": 10684 }, { "epoch": 0.15955024302107676, "grad_norm": 0.30078125, "grad_norm_var": 0.002071237564086914, "learning_rate": 0.0001, "loss": 1.357, "loss/crossentropy": 2.784417748451233, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.19296953082084656, "step": 10685 }, { "epoch": 0.15956517519169175, "grad_norm": 0.359375, "grad_norm_var": 0.002072588602701823, "learning_rate": 0.0001, "loss": 1.6109, "loss/crossentropy": 2.6979836225509644, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.2554556503891945, "step": 10686 }, { "epoch": 0.1595801073623067, "grad_norm": 0.36328125, "grad_norm_var": 0.0016311009724934895, "learning_rate": 0.0001, "loss": 1.5818, "loss/crossentropy": 2.6035739183425903, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.2263151779770851, "step": 10687 }, { "epoch": 0.1595950395329217, "grad_norm": 0.283203125, "grad_norm_var": 0.0009272098541259766, "learning_rate": 0.0001, "loss": 1.3455, "loss/crossentropy": 2.422590494155884, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1853603720664978, "step": 10688 }, { "epoch": 0.1596099717035367, "grad_norm": 0.29296875, "grad_norm_var": 0.0009531021118164062, "learning_rate": 0.0001, "loss": 1.3588, "loss/crossentropy": 2.432340145111084, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18304969370365143, "step": 10689 }, { "epoch": 0.15962490387415165, "grad_norm": 0.291015625, "grad_norm_var": 0.001024627685546875, "learning_rate": 0.0001, "loss": 1.3583, "loss/crossentropy": 2.6379905939102173, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19032137095928192, "step": 10690 }, { "epoch": 0.15963983604476664, "grad_norm": 0.326171875, "grad_norm_var": 0.0009839216868082683, "learning_rate": 0.0001, "loss": 1.3818, "loss/crossentropy": 2.39658260345459, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19039873778820038, "step": 10691 }, { "epoch": 0.15965476821538163, "grad_norm": 0.3125, "grad_norm_var": 0.0006743748982747396, "learning_rate": 0.0001, "loss": 1.435, "loss/crossentropy": 2.755913019180298, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2084077149629593, "step": 10692 }, { "epoch": 0.15966970038599662, "grad_norm": 0.36328125, "grad_norm_var": 0.0006512959798177083, "learning_rate": 0.0001, "loss": 1.6193, "loss/crossentropy": 2.4847406148910522, "loss/fcd": 1.375, "loss/idx": 11.0, "loss/logits": 0.2442629039287567, "step": 10693 }, { "epoch": 0.15968463255661158, "grad_norm": 0.294921875, "grad_norm_var": 0.0006512959798177083, "learning_rate": 0.0001, "loss": 1.4458, "loss/crossentropy": 2.8015024662017822, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.22317640483379364, "step": 10694 }, { "epoch": 0.15969956472722657, "grad_norm": 0.314453125, "grad_norm_var": 0.0006442864735921224, "learning_rate": 0.0001, "loss": 1.4294, "loss/crossentropy": 2.7663615942001343, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2028431072831154, "step": 10695 }, { "epoch": 0.15971449689784156, "grad_norm": 0.31640625, "grad_norm_var": 0.000643157958984375, "learning_rate": 0.0001, "loss": 1.476, "loss/crossentropy": 2.463612675666809, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20645327866077423, "step": 10696 }, { "epoch": 0.15972942906845652, "grad_norm": 0.310546875, "grad_norm_var": 0.0006479740142822266, "learning_rate": 0.0001, "loss": 1.3869, "loss/crossentropy": 2.729974389076233, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1993536651134491, "step": 10697 }, { "epoch": 0.15974436123907151, "grad_norm": 0.3046875, "grad_norm_var": 0.0006594181060791016, "learning_rate": 0.0001, "loss": 1.3661, "loss/crossentropy": 2.5009645223617554, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18638578802347183, "step": 10698 }, { "epoch": 0.1597592934096865, "grad_norm": 0.341796875, "grad_norm_var": 0.0006753921508789063, "learning_rate": 0.0001, "loss": 1.4529, "loss/crossentropy": 2.791929006576538, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21847564727067947, "step": 10699 }, { "epoch": 0.1597742255803015, "grad_norm": 0.3125, "grad_norm_var": 0.0006765842437744141, "learning_rate": 0.0001, "loss": 1.3693, "loss/crossentropy": 2.5586084127426147, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.17792122811079025, "step": 10700 }, { "epoch": 0.15978915775091646, "grad_norm": 0.3125, "grad_norm_var": 0.0006582736968994141, "learning_rate": 0.0001, "loss": 1.3876, "loss/crossentropy": 2.607755661010742, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.1805480718612671, "step": 10701 }, { "epoch": 0.15980408992153144, "grad_norm": 0.2890625, "grad_norm_var": 0.0005861759185791015, "learning_rate": 0.0001, "loss": 1.4762, "loss/crossentropy": 2.680632472038269, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20274463295936584, "step": 10702 }, { "epoch": 0.15981902209214643, "grad_norm": 0.32421875, "grad_norm_var": 0.0004265944163004557, "learning_rate": 0.0001, "loss": 1.3436, "loss/crossentropy": 2.6909995079040527, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.17949050664901733, "step": 10703 }, { "epoch": 0.1598339542627614, "grad_norm": 0.33984375, "grad_norm_var": 0.00041046142578125, "learning_rate": 0.0001, "loss": 1.544, "loss/crossentropy": 2.4808300733566284, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.23151970654726028, "step": 10704 }, { "epoch": 0.15984888643337639, "grad_norm": 0.31640625, "grad_norm_var": 0.000374603271484375, "learning_rate": 0.0001, "loss": 1.4709, "loss/crossentropy": 2.6013777256011963, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.21701134741306305, "step": 10705 }, { "epoch": 0.15986381860399138, "grad_norm": 0.28125, "grad_norm_var": 0.0004142602284749349, "learning_rate": 0.0001, "loss": 1.4687, "loss/crossentropy": 2.7655982971191406, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.2147824838757515, "step": 10706 }, { "epoch": 0.15987875077460634, "grad_norm": 0.353515625, "grad_norm_var": 0.0004970391591389973, "learning_rate": 0.0001, "loss": 1.6336, "loss/crossentropy": 2.796436905860901, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.26249320805072784, "step": 10707 }, { "epoch": 0.15989368294522133, "grad_norm": 0.30078125, "grad_norm_var": 0.0005142052968343098, "learning_rate": 0.0001, "loss": 1.3768, "loss/crossentropy": 2.8593103885650635, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18145033717155457, "step": 10708 }, { "epoch": 0.15990861511583632, "grad_norm": 0.33984375, "grad_norm_var": 0.0004047234853108724, "learning_rate": 0.0001, "loss": 1.6419, "loss/crossentropy": 2.6582038402557373, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.26299789547920227, "step": 10709 }, { "epoch": 0.1599235472864513, "grad_norm": 0.306640625, "grad_norm_var": 0.0003806908925374349, "learning_rate": 0.0001, "loss": 1.4087, "loss/crossentropy": 2.630557656288147, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19383440911769867, "step": 10710 }, { "epoch": 0.15993847945706627, "grad_norm": 0.30078125, "grad_norm_var": 0.00039615631103515623, "learning_rate": 0.0001, "loss": 1.409, "loss/crossentropy": 2.794037342071533, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20201584696769714, "step": 10711 }, { "epoch": 0.15995341162768126, "grad_norm": 0.3203125, "grad_norm_var": 0.000397491455078125, "learning_rate": 0.0001, "loss": 1.4837, "loss/crossentropy": 2.765730142593384, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.22194644063711166, "step": 10712 }, { "epoch": 0.15996834379829625, "grad_norm": 0.275390625, "grad_norm_var": 0.0004999160766601563, "learning_rate": 0.0001, "loss": 1.4747, "loss/crossentropy": 2.3282517194747925, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2247365638613701, "step": 10713 }, { "epoch": 0.1599832759689112, "grad_norm": 0.287109375, "grad_norm_var": 0.0005403995513916015, "learning_rate": 0.0001, "loss": 1.3487, "loss/crossentropy": 2.672971248626709, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.19635476171970367, "step": 10714 }, { "epoch": 0.1599982081395262, "grad_norm": 0.283203125, "grad_norm_var": 0.000527048110961914, "learning_rate": 0.0001, "loss": 1.3808, "loss/crossentropy": 2.615836262702942, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18936587870121002, "step": 10715 }, { "epoch": 0.1600131403101412, "grad_norm": 0.3125, "grad_norm_var": 0.000527048110961914, "learning_rate": 0.0001, "loss": 1.4766, "loss/crossentropy": 2.44684898853302, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.23046231269836426, "step": 10716 }, { "epoch": 0.16002807248075618, "grad_norm": 0.365234375, "grad_norm_var": 0.0007257461547851562, "learning_rate": 0.0001, "loss": 1.483, "loss/crossentropy": 2.409078359603882, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.21347397565841675, "step": 10717 }, { "epoch": 0.16004300465137114, "grad_norm": 0.328125, "grad_norm_var": 0.0007003148396809896, "learning_rate": 0.0001, "loss": 1.4117, "loss/crossentropy": 2.4995373487472534, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18905442208051682, "step": 10718 }, { "epoch": 0.16005793682198613, "grad_norm": 0.283203125, "grad_norm_var": 0.0007533868153889974, "learning_rate": 0.0001, "loss": 1.4214, "loss/crossentropy": 2.322161912918091, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20657768100500107, "step": 10719 }, { "epoch": 0.16007286899260112, "grad_norm": 0.318359375, "grad_norm_var": 0.0007028579711914062, "learning_rate": 0.0001, "loss": 1.4511, "loss/crossentropy": 2.678735375404358, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20894578099250793, "step": 10720 }, { "epoch": 0.16008780116321608, "grad_norm": 0.279296875, "grad_norm_var": 0.0007611433664957682, "learning_rate": 0.0001, "loss": 1.3433, "loss/crossentropy": 2.6003161668777466, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18310077488422394, "step": 10721 }, { "epoch": 0.16010273333383107, "grad_norm": 0.294921875, "grad_norm_var": 0.0007232030232747396, "learning_rate": 0.0001, "loss": 1.3473, "loss/crossentropy": 2.749300479888916, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.175467349588871, "step": 10722 }, { "epoch": 0.16011766550444606, "grad_norm": 0.2890625, "grad_norm_var": 0.0006030877431233724, "learning_rate": 0.0001, "loss": 1.3456, "loss/crossentropy": 2.7074960470199585, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1932772919535637, "step": 10723 }, { "epoch": 0.16013259767506102, "grad_norm": 0.267578125, "grad_norm_var": 0.000691986083984375, "learning_rate": 0.0001, "loss": 1.3063, "loss/crossentropy": 2.483142375946045, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.16955678910017014, "step": 10724 }, { "epoch": 0.160147529845676, "grad_norm": 0.31640625, "grad_norm_var": 0.00061187744140625, "learning_rate": 0.0001, "loss": 1.424, "loss/crossentropy": 2.8287787437438965, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20920515060424805, "step": 10725 }, { "epoch": 0.160162462016291, "grad_norm": 0.29296875, "grad_norm_var": 0.0006146589914957682, "learning_rate": 0.0001, "loss": 1.4311, "loss/crossentropy": 2.601547122001648, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2123778760433197, "step": 10726 }, { "epoch": 0.160177394186906, "grad_norm": 0.298828125, "grad_norm_var": 0.00061492919921875, "learning_rate": 0.0001, "loss": 1.5099, "loss/crossentropy": 2.7299246788024902, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.22474072873592377, "step": 10727 }, { "epoch": 0.16019232635752095, "grad_norm": 0.314453125, "grad_norm_var": 0.000601816177368164, "learning_rate": 0.0001, "loss": 1.3465, "loss/crossentropy": 2.48542582988739, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17464132606983185, "step": 10728 }, { "epoch": 0.16020725852813594, "grad_norm": 0.52734375, "grad_norm_var": 0.0037286758422851564, "learning_rate": 0.0001, "loss": 1.7117, "loss/crossentropy": 2.1079294681549072, "loss/fcd": 1.50390625, "loss/idx": 11.0, "loss/logits": 0.20780018717050552, "step": 10729 }, { "epoch": 0.16022219069875093, "grad_norm": 0.291015625, "grad_norm_var": 0.0037144978841145834, "learning_rate": 0.0001, "loss": 1.4829, "loss/crossentropy": 2.6459062099456787, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.2290285900235176, "step": 10730 }, { "epoch": 0.1602371228693659, "grad_norm": 0.310546875, "grad_norm_var": 0.0036401748657226562, "learning_rate": 0.0001, "loss": 1.4594, "loss/crossentropy": 2.635981798171997, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2289353758096695, "step": 10731 }, { "epoch": 0.16025205503998088, "grad_norm": 0.32421875, "grad_norm_var": 0.003639984130859375, "learning_rate": 0.0001, "loss": 1.5169, "loss/crossentropy": 2.699217915534973, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.23173953592777252, "step": 10732 }, { "epoch": 0.16026698721059587, "grad_norm": 0.337890625, "grad_norm_var": 0.003517595926920573, "learning_rate": 0.0001, "loss": 1.6274, "loss/crossentropy": 2.727424144744873, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.27979158610105515, "step": 10733 }, { "epoch": 0.16028191938121086, "grad_norm": 0.384765625, "grad_norm_var": 0.0038010756174723308, "learning_rate": 0.0001, "loss": 1.5781, "loss/crossentropy": 2.4528605937957764, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.21483412384986877, "step": 10734 }, { "epoch": 0.16029685155182583, "grad_norm": 0.35546875, "grad_norm_var": 0.0037663777669270835, "learning_rate": 0.0001, "loss": 1.3614, "loss/crossentropy": 2.3442596197128296, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.17782744765281677, "step": 10735 }, { "epoch": 0.16031178372244081, "grad_norm": 0.32421875, "grad_norm_var": 0.003763182957967122, "learning_rate": 0.0001, "loss": 1.3129, "loss/crossentropy": 2.3128104209899902, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17224295437335968, "step": 10736 }, { "epoch": 0.1603267158930558, "grad_norm": 0.28515625, "grad_norm_var": 0.0037291844685872397, "learning_rate": 0.0001, "loss": 1.378, "loss/crossentropy": 2.6163567304611206, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1826803758740425, "step": 10737 }, { "epoch": 0.16034164806367077, "grad_norm": 0.32421875, "grad_norm_var": 0.0036617120107014975, "learning_rate": 0.0001, "loss": 1.3182, "loss/crossentropy": 2.565077781677246, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.16975454986095428, "step": 10738 }, { "epoch": 0.16035658023428576, "grad_norm": 0.34765625, "grad_norm_var": 0.0035739739735921225, "learning_rate": 0.0001, "loss": 1.553, "loss/crossentropy": 2.5609982013702393, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.2209559679031372, "step": 10739 }, { "epoch": 0.16037151240490075, "grad_norm": 0.3125, "grad_norm_var": 0.003317705790201823, "learning_rate": 0.0001, "loss": 1.2854, "loss/crossentropy": 2.710124135017395, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.16426821798086166, "step": 10740 }, { "epoch": 0.1603864445755157, "grad_norm": 0.376953125, "grad_norm_var": 0.0034029483795166016, "learning_rate": 0.0001, "loss": 1.6216, "loss/crossentropy": 2.548937439918518, "loss/fcd": 1.375, "loss/idx": 11.0, "loss/logits": 0.24662795662879944, "step": 10741 }, { "epoch": 0.1604013767461307, "grad_norm": 0.39453125, "grad_norm_var": 0.003437662124633789, "learning_rate": 0.0001, "loss": 1.6101, "loss/crossentropy": 2.5764771699905396, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.2624489665031433, "step": 10742 }, { "epoch": 0.1604163089167457, "grad_norm": 0.34765625, "grad_norm_var": 0.003290239969889323, "learning_rate": 0.0001, "loss": 1.4692, "loss/crossentropy": 2.553961753845215, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2075032740831375, "step": 10743 }, { "epoch": 0.16043124108736068, "grad_norm": 0.3515625, "grad_norm_var": 0.003213230768839518, "learning_rate": 0.0001, "loss": 1.6126, "loss/crossentropy": 2.675787091255188, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.25711962580680847, "step": 10744 }, { "epoch": 0.16044617325797564, "grad_norm": 0.322265625, "grad_norm_var": 0.0009852091471354166, "learning_rate": 0.0001, "loss": 1.3787, "loss/crossentropy": 2.2930359840393066, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18335489183664322, "step": 10745 }, { "epoch": 0.16046110542859063, "grad_norm": 0.455078125, "grad_norm_var": 0.0016634623209635417, "learning_rate": 0.0001, "loss": 1.6252, "loss/crossentropy": 2.496026396751404, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.2618815526366234, "step": 10746 }, { "epoch": 0.16047603759920562, "grad_norm": 0.392578125, "grad_norm_var": 0.001683489481608073, "learning_rate": 0.0001, "loss": 1.4917, "loss/crossentropy": 2.6443374156951904, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2065274715423584, "step": 10747 }, { "epoch": 0.16049096976982058, "grad_norm": 0.28515625, "grad_norm_var": 0.0019250869750976562, "learning_rate": 0.0001, "loss": 1.3931, "loss/crossentropy": 2.7599329948425293, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.2016872838139534, "step": 10748 }, { "epoch": 0.16050590194043557, "grad_norm": 0.361328125, "grad_norm_var": 0.0019220352172851563, "learning_rate": 0.0001, "loss": 1.69, "loss/crossentropy": 2.35078227519989, "loss/fcd": 1.4453125, "loss/idx": 11.0, "loss/logits": 0.2446979582309723, "step": 10749 }, { "epoch": 0.16052083411105056, "grad_norm": 0.380859375, "grad_norm_var": 0.0019055684407552084, "learning_rate": 0.0001, "loss": 1.6093, "loss/crossentropy": 2.33239209651947, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.21475668251514435, "step": 10750 }, { "epoch": 0.16053576628166552, "grad_norm": 0.32421875, "grad_norm_var": 0.0019482930501302084, "learning_rate": 0.0001, "loss": 1.4419, "loss/crossentropy": 2.7379567623138428, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20750252157449722, "step": 10751 }, { "epoch": 0.1605506984522805, "grad_norm": 0.318359375, "grad_norm_var": 0.00196989377339681, "learning_rate": 0.0001, "loss": 1.5518, "loss/crossentropy": 2.578841209411621, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.2276071012020111, "step": 10752 }, { "epoch": 0.1605656306228955, "grad_norm": 0.30078125, "grad_norm_var": 0.0018526554107666016, "learning_rate": 0.0001, "loss": 1.291, "loss/crossentropy": 2.5693122148513794, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1621345579624176, "step": 10753 }, { "epoch": 0.1605805627935105, "grad_norm": 0.30859375, "grad_norm_var": 0.00192106564839681, "learning_rate": 0.0001, "loss": 1.451, "loss/crossentropy": 2.6761852502822876, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21268774569034576, "step": 10754 }, { "epoch": 0.16059549496412545, "grad_norm": 0.267578125, "grad_norm_var": 0.0023335774739583334, "learning_rate": 0.0001, "loss": 1.3099, "loss/crossentropy": 2.56063711643219, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1653973013162613, "step": 10755 }, { "epoch": 0.16061042713474044, "grad_norm": 0.27734375, "grad_norm_var": 0.0025573094685872396, "learning_rate": 0.0001, "loss": 1.3193, "loss/crossentropy": 2.6439725160598755, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17872312664985657, "step": 10756 }, { "epoch": 0.16062535930535543, "grad_norm": 0.375, "grad_norm_var": 0.002548329035441081, "learning_rate": 0.0001, "loss": 1.5145, "loss/crossentropy": 2.6271417140960693, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.24103188514709473, "step": 10757 }, { "epoch": 0.1606402914759704, "grad_norm": 0.283203125, "grad_norm_var": 0.0025347391764322918, "learning_rate": 0.0001, "loss": 1.3321, "loss/crossentropy": 2.3734270334243774, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.16805704683065414, "step": 10758 }, { "epoch": 0.16065522364658538, "grad_norm": 0.291015625, "grad_norm_var": 0.0026356856028238933, "learning_rate": 0.0001, "loss": 1.3295, "loss/crossentropy": 2.7320412397384644, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.16932151466608047, "step": 10759 }, { "epoch": 0.16067015581720037, "grad_norm": 0.291015625, "grad_norm_var": 0.0026982625325520835, "learning_rate": 0.0001, "loss": 1.3672, "loss/crossentropy": 2.65306293964386, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.19534583389759064, "step": 10760 }, { "epoch": 0.16068508798781536, "grad_norm": 0.314453125, "grad_norm_var": 0.0027071634928385415, "learning_rate": 0.0001, "loss": 1.4447, "loss/crossentropy": 2.763045310974121, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20646657049655914, "step": 10761 }, { "epoch": 0.16070002015843032, "grad_norm": 0.26953125, "grad_norm_var": 0.0016818841298421223, "learning_rate": 0.0001, "loss": 1.3717, "loss/crossentropy": 2.4255107641220093, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1764136180281639, "step": 10762 }, { "epoch": 0.1607149523290453, "grad_norm": 0.29296875, "grad_norm_var": 0.0012725194295247395, "learning_rate": 0.0001, "loss": 1.2671, "loss/crossentropy": 2.46523118019104, "loss/fcd": 1.0859375, "loss/idx": 11.0, "loss/logits": 0.18111592531204224, "step": 10763 }, { "epoch": 0.1607298844996603, "grad_norm": 0.310546875, "grad_norm_var": 0.0012326399485270182, "learning_rate": 0.0001, "loss": 1.4388, "loss/crossentropy": 2.4867933988571167, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.19275201857089996, "step": 10764 }, { "epoch": 0.16074481667027526, "grad_norm": 0.298828125, "grad_norm_var": 0.0010525862375895183, "learning_rate": 0.0001, "loss": 1.463, "loss/crossentropy": 2.5177313089370728, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21303896605968475, "step": 10765 }, { "epoch": 0.16075974884089025, "grad_norm": 0.283203125, "grad_norm_var": 0.0006806532541910808, "learning_rate": 0.0001, "loss": 1.4283, "loss/crossentropy": 2.7675105333328247, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.1939534693956375, "step": 10766 }, { "epoch": 0.16077468101150524, "grad_norm": 0.302734375, "grad_norm_var": 0.0006413141886393229, "learning_rate": 0.0001, "loss": 1.417, "loss/crossentropy": 2.5000261068344116, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20602020621299744, "step": 10767 }, { "epoch": 0.1607896131821202, "grad_norm": 0.28515625, "grad_norm_var": 0.0006248315175374348, "learning_rate": 0.0001, "loss": 1.3872, "loss/crossentropy": 2.7351564168930054, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1958358883857727, "step": 10768 }, { "epoch": 0.1608045453527352, "grad_norm": 0.318359375, "grad_norm_var": 0.0006530125935872396, "learning_rate": 0.0001, "loss": 1.4862, "loss/crossentropy": 2.683825969696045, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21274691075086594, "step": 10769 }, { "epoch": 0.16081947752335018, "grad_norm": 0.296875, "grad_norm_var": 0.0006451924641927083, "learning_rate": 0.0001, "loss": 1.3767, "loss/crossentropy": 2.592376947402954, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1892354041337967, "step": 10770 }, { "epoch": 0.16083440969396517, "grad_norm": 0.369140625, "grad_norm_var": 0.00088653564453125, "learning_rate": 0.0001, "loss": 1.437, "loss/crossentropy": 2.5362573862075806, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.18700328469276428, "step": 10771 }, { "epoch": 0.16084934186458014, "grad_norm": 0.349609375, "grad_norm_var": 0.0009588718414306641, "learning_rate": 0.0001, "loss": 1.4081, "loss/crossentropy": 2.5909985303878784, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19328201562166214, "step": 10772 }, { "epoch": 0.16086427403519513, "grad_norm": 0.412109375, "grad_norm_var": 0.0013753255208333334, "learning_rate": 0.0001, "loss": 1.5491, "loss/crossentropy": 2.7712186574935913, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.22492721676826477, "step": 10773 }, { "epoch": 0.16087920620581012, "grad_norm": 0.3515625, "grad_norm_var": 0.001418161392211914, "learning_rate": 0.0001, "loss": 1.5054, "loss/crossentropy": 2.4931752681732178, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21637600660324097, "step": 10774 }, { "epoch": 0.16089413837642508, "grad_norm": 0.2890625, "grad_norm_var": 0.0014245986938476562, "learning_rate": 0.0001, "loss": 1.3149, "loss/crossentropy": 2.6500707864761353, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1820617914199829, "step": 10775 }, { "epoch": 0.16090907054704007, "grad_norm": 0.2890625, "grad_norm_var": 0.0014310042063395182, "learning_rate": 0.0001, "loss": 1.5173, "loss/crossentropy": 2.5204145908355713, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.23216760158538818, "step": 10776 }, { "epoch": 0.16092400271765506, "grad_norm": 0.27734375, "grad_norm_var": 0.0015176773071289063, "learning_rate": 0.0001, "loss": 1.3687, "loss/crossentropy": 2.565529942512512, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18904365599155426, "step": 10777 }, { "epoch": 0.16093893488827005, "grad_norm": 0.283203125, "grad_norm_var": 0.0014514764149983724, "learning_rate": 0.0001, "loss": 1.3512, "loss/crossentropy": 2.532145380973816, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17930983006954193, "step": 10778 }, { "epoch": 0.160953867058885, "grad_norm": 0.287109375, "grad_norm_var": 0.0014693578084309896, "learning_rate": 0.0001, "loss": 1.289, "loss/crossentropy": 2.7521040439605713, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.17181231081485748, "step": 10779 }, { "epoch": 0.1609687992295, "grad_norm": 0.384765625, "grad_norm_var": 0.0017918904622395834, "learning_rate": 0.0001, "loss": 1.7173, "loss/crossentropy": 2.3053231239318848, "loss/fcd": 1.4609375, "loss/idx": 11.0, "loss/logits": 0.2563590630888939, "step": 10780 }, { "epoch": 0.160983731400115, "grad_norm": 0.330078125, "grad_norm_var": 0.0017756144205729166, "learning_rate": 0.0001, "loss": 1.4987, "loss/crossentropy": 2.3458417654037476, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2057371512055397, "step": 10781 }, { "epoch": 0.16099866357072995, "grad_norm": 0.37109375, "grad_norm_var": 0.0018349806467692056, "learning_rate": 0.0001, "loss": 1.6771, "loss/crossentropy": 2.5006065368652344, "loss/fcd": 1.41796875, "loss/idx": 11.0, "loss/logits": 0.2590993344783783, "step": 10782 }, { "epoch": 0.16101359574134494, "grad_norm": 0.30859375, "grad_norm_var": 0.0018198649088541667, "learning_rate": 0.0001, "loss": 1.4372, "loss/crossentropy": 2.497784376144409, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2028454691171646, "step": 10783 }, { "epoch": 0.16102852791195993, "grad_norm": 0.404296875, "grad_norm_var": 0.0020709832509358725, "learning_rate": 0.0001, "loss": 1.4779, "loss/crossentropy": 2.457517385482788, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.20060446858406067, "step": 10784 }, { "epoch": 0.1610434600825749, "grad_norm": 0.294921875, "grad_norm_var": 0.0021499474843343098, "learning_rate": 0.0001, "loss": 1.4089, "loss/crossentropy": 2.5740281343460083, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19791749119758606, "step": 10785 }, { "epoch": 0.16105839225318988, "grad_norm": 0.326171875, "grad_norm_var": 0.0020696004231770832, "learning_rate": 0.0001, "loss": 1.5346, "loss/crossentropy": 2.734947085380554, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.23773249983787537, "step": 10786 }, { "epoch": 0.16107332442380487, "grad_norm": 0.333984375, "grad_norm_var": 0.0019774754842122396, "learning_rate": 0.0001, "loss": 1.3865, "loss/crossentropy": 2.6367220878601074, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.20683488249778748, "step": 10787 }, { "epoch": 0.16108825659441986, "grad_norm": 0.28125, "grad_norm_var": 0.002098194758097331, "learning_rate": 0.0001, "loss": 1.353, "loss/crossentropy": 2.7016541957855225, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.17726466059684753, "step": 10788 }, { "epoch": 0.16110318876503482, "grad_norm": 0.306640625, "grad_norm_var": 0.0015900770823160807, "learning_rate": 0.0001, "loss": 1.5259, "loss/crossentropy": 2.655067563056946, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.23685741424560547, "step": 10789 }, { "epoch": 0.1611181209356498, "grad_norm": 0.2734375, "grad_norm_var": 0.0016422112782796224, "learning_rate": 0.0001, "loss": 1.3632, "loss/crossentropy": 2.6320812702178955, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18355047702789307, "step": 10790 }, { "epoch": 0.1611330531062648, "grad_norm": 0.3359375, "grad_norm_var": 0.0016170342763264973, "learning_rate": 0.0001, "loss": 1.4483, "loss/crossentropy": 2.460936665534973, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20221949368715286, "step": 10791 }, { "epoch": 0.16114798527687976, "grad_norm": 0.3046875, "grad_norm_var": 0.0015720208485921225, "learning_rate": 0.0001, "loss": 1.5391, "loss/crossentropy": 2.578931212425232, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.22664441913366318, "step": 10792 }, { "epoch": 0.16116291744749475, "grad_norm": 0.287109375, "grad_norm_var": 0.0015237808227539062, "learning_rate": 0.0001, "loss": 1.4014, "loss/crossentropy": 2.495506525039673, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.2021643966436386, "step": 10793 }, { "epoch": 0.16117784961810974, "grad_norm": 0.30078125, "grad_norm_var": 0.0014578342437744141, "learning_rate": 0.0001, "loss": 1.2966, "loss/crossentropy": 2.609887719154358, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1677282229065895, "step": 10794 }, { "epoch": 0.16119278178872473, "grad_norm": 0.3203125, "grad_norm_var": 0.0013781229654947916, "learning_rate": 0.0001, "loss": 1.4703, "loss/crossentropy": 2.5898821353912354, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.212453231215477, "step": 10795 }, { "epoch": 0.1612077139593397, "grad_norm": 0.29296875, "grad_norm_var": 0.001145792007446289, "learning_rate": 0.0001, "loss": 1.3777, "loss/crossentropy": 2.6854931116104126, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19406269490718842, "step": 10796 }, { "epoch": 0.16122264612995468, "grad_norm": 0.287109375, "grad_norm_var": 0.001186354955037435, "learning_rate": 0.0001, "loss": 1.4981, "loss/crossentropy": 2.60287082195282, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22861018031835556, "step": 10797 }, { "epoch": 0.16123757830056967, "grad_norm": 0.283203125, "grad_norm_var": 0.0010039647420247396, "learning_rate": 0.0001, "loss": 1.4765, "loss/crossentropy": 2.505370616912842, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21478411555290222, "step": 10798 }, { "epoch": 0.16125251047118463, "grad_norm": 0.287109375, "grad_norm_var": 0.0010335127512613933, "learning_rate": 0.0001, "loss": 1.3807, "loss/crossentropy": 2.7286936044692993, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19708426296710968, "step": 10799 }, { "epoch": 0.16126744264179962, "grad_norm": 0.287109375, "grad_norm_var": 0.0003792921702067057, "learning_rate": 0.0001, "loss": 1.3037, "loss/crossentropy": 2.737056255340576, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1786559820175171, "step": 10800 }, { "epoch": 0.16128237481241461, "grad_norm": 0.416015625, "grad_norm_var": 0.0012110233306884765, "learning_rate": 0.0001, "loss": 1.4931, "loss/crossentropy": 2.4883254766464233, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.18446356058120728, "step": 10801 }, { "epoch": 0.16129730698302958, "grad_norm": 0.314453125, "grad_norm_var": 0.0011908054351806641, "learning_rate": 0.0001, "loss": 1.3596, "loss/crossentropy": 2.789241909980774, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1877039149403572, "step": 10802 }, { "epoch": 0.16131223915364457, "grad_norm": 0.31640625, "grad_norm_var": 0.0011468887329101562, "learning_rate": 0.0001, "loss": 1.4723, "loss/crossentropy": 2.493985414505005, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20276354253292084, "step": 10803 }, { "epoch": 0.16132717132425956, "grad_norm": 0.28515625, "grad_norm_var": 0.0011349995930989584, "learning_rate": 0.0001, "loss": 1.3076, "loss/crossentropy": 2.613509178161621, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.17089711874723434, "step": 10804 }, { "epoch": 0.16134210349487454, "grad_norm": 0.31640625, "grad_norm_var": 0.0011415958404541015, "learning_rate": 0.0001, "loss": 1.5517, "loss/crossentropy": 2.554261803627014, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.23527522385120392, "step": 10805 }, { "epoch": 0.1613570356654895, "grad_norm": 0.3359375, "grad_norm_var": 0.0011080265045166015, "learning_rate": 0.0001, "loss": 1.5095, "loss/crossentropy": 2.5330464839935303, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2165200263261795, "step": 10806 }, { "epoch": 0.1613719678361045, "grad_norm": 0.359375, "grad_norm_var": 0.0012213230133056641, "learning_rate": 0.0001, "loss": 1.7475, "loss/crossentropy": 2.436341643333435, "loss/fcd": 1.45703125, "loss/idx": 11.0, "loss/logits": 0.2905137538909912, "step": 10807 }, { "epoch": 0.16138690000671949, "grad_norm": 0.33984375, "grad_norm_var": 0.0012636661529541015, "learning_rate": 0.0001, "loss": 1.423, "loss/crossentropy": 2.5526448488235474, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.1925204172730446, "step": 10808 }, { "epoch": 0.16140183217733445, "grad_norm": 0.328125, "grad_norm_var": 0.001219940185546875, "learning_rate": 0.0001, "loss": 1.5671, "loss/crossentropy": 2.5416553020477295, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.23119332641363144, "step": 10809 }, { "epoch": 0.16141676434794944, "grad_norm": 0.30078125, "grad_norm_var": 0.001219940185546875, "learning_rate": 0.0001, "loss": 1.3463, "loss/crossentropy": 2.595625638961792, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17439556866884232, "step": 10810 }, { "epoch": 0.16143169651856443, "grad_norm": 0.318359375, "grad_norm_var": 0.0012192885080973307, "learning_rate": 0.0001, "loss": 1.46, "loss/crossentropy": 2.4801559448242188, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.2060721218585968, "step": 10811 }, { "epoch": 0.1614466286891794, "grad_norm": 0.376953125, "grad_norm_var": 0.001393572489420573, "learning_rate": 0.0001, "loss": 1.5389, "loss/crossentropy": 2.502204418182373, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.2381325513124466, "step": 10812 }, { "epoch": 0.16146156085979438, "grad_norm": 0.341796875, "grad_norm_var": 0.0013259251912434895, "learning_rate": 0.0001, "loss": 1.5348, "loss/crossentropy": 2.6144338846206665, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.22622404992580414, "step": 10813 }, { "epoch": 0.16147649303040937, "grad_norm": 0.3125, "grad_norm_var": 0.0012145837148030598, "learning_rate": 0.0001, "loss": 1.5034, "loss/crossentropy": 2.409191846847534, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22211821377277374, "step": 10814 }, { "epoch": 0.16149142520102436, "grad_norm": 0.326171875, "grad_norm_var": 0.001100778579711914, "learning_rate": 0.0001, "loss": 1.4528, "loss/crossentropy": 2.5483970642089844, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20673313736915588, "step": 10815 }, { "epoch": 0.16150635737163932, "grad_norm": 0.291015625, "grad_norm_var": 0.001079543431599935, "learning_rate": 0.0001, "loss": 1.3309, "loss/crossentropy": 2.710378050804138, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.1902637854218483, "step": 10816 }, { "epoch": 0.1615212895422543, "grad_norm": 0.365234375, "grad_norm_var": 0.0006580193837483724, "learning_rate": 0.0001, "loss": 1.5805, "loss/crossentropy": 2.621523380279541, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.2367386668920517, "step": 10817 }, { "epoch": 0.1615362217128693, "grad_norm": 0.337890625, "grad_norm_var": 0.0006538232167561849, "learning_rate": 0.0001, "loss": 1.4614, "loss/crossentropy": 2.422270894050598, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.1957930251955986, "step": 10818 }, { "epoch": 0.16155115388348426, "grad_norm": 0.26953125, "grad_norm_var": 0.0008651574452718098, "learning_rate": 0.0001, "loss": 1.3389, "loss/crossentropy": 2.6445049047470093, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1787785440683365, "step": 10819 }, { "epoch": 0.16156608605409925, "grad_norm": 0.3125, "grad_norm_var": 0.0007654666900634765, "learning_rate": 0.0001, "loss": 1.5249, "loss/crossentropy": 2.6366037130355835, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.2279811054468155, "step": 10820 }, { "epoch": 0.16158101822471424, "grad_norm": 0.31640625, "grad_norm_var": 0.0007654666900634765, "learning_rate": 0.0001, "loss": 1.553, "loss/crossentropy": 2.4496649503707886, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.21316781640052795, "step": 10821 }, { "epoch": 0.16159595039532923, "grad_norm": 0.330078125, "grad_norm_var": 0.000760650634765625, "learning_rate": 0.0001, "loss": 1.5758, "loss/crossentropy": 2.526779890060425, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.23592828214168549, "step": 10822 }, { "epoch": 0.1616108825659442, "grad_norm": 0.296875, "grad_norm_var": 0.0007321675618489583, "learning_rate": 0.0001, "loss": 1.352, "loss/crossentropy": 2.3537343740463257, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18008534610271454, "step": 10823 }, { "epoch": 0.16162581473655918, "grad_norm": 0.302734375, "grad_norm_var": 0.0007336775461832683, "learning_rate": 0.0001, "loss": 1.4107, "loss/crossentropy": 2.891109824180603, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19192134588956833, "step": 10824 }, { "epoch": 0.16164074690717417, "grad_norm": 0.3515625, "grad_norm_var": 0.0007920424143473307, "learning_rate": 0.0001, "loss": 1.5655, "loss/crossentropy": 2.7113430500030518, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.23739425837993622, "step": 10825 }, { "epoch": 0.16165567907778913, "grad_norm": 0.283203125, "grad_norm_var": 0.0008608500162760416, "learning_rate": 0.0001, "loss": 1.3898, "loss/crossentropy": 2.650458335876465, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1944868192076683, "step": 10826 }, { "epoch": 0.16167061124840412, "grad_norm": 0.263671875, "grad_norm_var": 0.0010655721028645833, "learning_rate": 0.0001, "loss": 1.3796, "loss/crossentropy": 2.4701532125473022, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1920766904950142, "step": 10827 }, { "epoch": 0.1616855434190191, "grad_norm": 0.376953125, "grad_norm_var": 0.0010655721028645833, "learning_rate": 0.0001, "loss": 1.5575, "loss/crossentropy": 2.5239923000335693, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.22937792539596558, "step": 10828 }, { "epoch": 0.16170047558963407, "grad_norm": 0.33203125, "grad_norm_var": 0.001039743423461914, "learning_rate": 0.0001, "loss": 1.5052, "loss/crossentropy": 2.9446974992752075, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.21218927949666977, "step": 10829 }, { "epoch": 0.16171540776024906, "grad_norm": 0.306640625, "grad_norm_var": 0.00104522705078125, "learning_rate": 0.0001, "loss": 1.5941, "loss/crossentropy": 2.6062188148498535, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.2503935396671295, "step": 10830 }, { "epoch": 0.16173033993086405, "grad_norm": 0.37890625, "grad_norm_var": 0.001287698745727539, "learning_rate": 0.0001, "loss": 1.6243, "loss/crossentropy": 2.590059995651245, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.23755213618278503, "step": 10831 }, { "epoch": 0.16174527210147904, "grad_norm": 0.2578125, "grad_norm_var": 0.001483599344889323, "learning_rate": 0.0001, "loss": 1.2536, "loss/crossentropy": 2.628219246864319, "loss/fcd": 1.09375, "loss/idx": 11.0, "loss/logits": 0.15986726433038712, "step": 10832 }, { "epoch": 0.161760204272094, "grad_norm": 0.298828125, "grad_norm_var": 0.0013376871744791667, "learning_rate": 0.0001, "loss": 1.5459, "loss/crossentropy": 2.7147974967956543, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.23732832074165344, "step": 10833 }, { "epoch": 0.161775136442709, "grad_norm": 0.32421875, "grad_norm_var": 0.0013048648834228516, "learning_rate": 0.0001, "loss": 1.4891, "loss/crossentropy": 2.4508031606674194, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.20390286296606064, "step": 10834 }, { "epoch": 0.16179006861332398, "grad_norm": 0.439453125, "grad_norm_var": 0.0021331787109375, "learning_rate": 0.0001, "loss": 1.9734, "loss/crossentropy": 2.278685510158539, "loss/fcd": 1.640625, "loss/idx": 11.0, "loss/logits": 0.33275844901800156, "step": 10835 }, { "epoch": 0.16180500078393895, "grad_norm": 0.314453125, "grad_norm_var": 0.0021306196848551433, "learning_rate": 0.0001, "loss": 1.4578, "loss/crossentropy": 2.75877583026886, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20784759521484375, "step": 10836 }, { "epoch": 0.16181993295455394, "grad_norm": 0.3046875, "grad_norm_var": 0.002150074640909831, "learning_rate": 0.0001, "loss": 1.3664, "loss/crossentropy": 2.6598325967788696, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18672916293144226, "step": 10837 }, { "epoch": 0.16183486512516893, "grad_norm": 0.28515625, "grad_norm_var": 0.002231597900390625, "learning_rate": 0.0001, "loss": 1.517, "loss/crossentropy": 2.421264886856079, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.22015909105539322, "step": 10838 }, { "epoch": 0.16184979729578391, "grad_norm": 0.3359375, "grad_norm_var": 0.0022074381510416665, "learning_rate": 0.0001, "loss": 1.4191, "loss/crossentropy": 2.6774051189422607, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19253858923912048, "step": 10839 }, { "epoch": 0.16186472946639888, "grad_norm": 0.33203125, "grad_norm_var": 0.002184788386027018, "learning_rate": 0.0001, "loss": 1.3471, "loss/crossentropy": 2.866185188293457, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1830621063709259, "step": 10840 }, { "epoch": 0.16187966163701387, "grad_norm": 0.2890625, "grad_norm_var": 0.002200047175089518, "learning_rate": 0.0001, "loss": 1.3458, "loss/crossentropy": 2.612982988357544, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18563804030418396, "step": 10841 }, { "epoch": 0.16189459380762886, "grad_norm": 0.302734375, "grad_norm_var": 0.002127567927042643, "learning_rate": 0.0001, "loss": 1.3791, "loss/crossentropy": 2.5243135690689087, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19553114473819733, "step": 10842 }, { "epoch": 0.16190952597824382, "grad_norm": 0.34375, "grad_norm_var": 0.0019118626912434896, "learning_rate": 0.0001, "loss": 1.4919, "loss/crossentropy": 2.4075846672058105, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.19891320914030075, "step": 10843 }, { "epoch": 0.1619244581488588, "grad_norm": 0.3125, "grad_norm_var": 0.0017371972401936848, "learning_rate": 0.0001, "loss": 1.5311, "loss/crossentropy": 2.6427786350250244, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.21861770004034042, "step": 10844 }, { "epoch": 0.1619393903194738, "grad_norm": 0.291015625, "grad_norm_var": 0.0017896016438802083, "learning_rate": 0.0001, "loss": 1.4155, "loss/crossentropy": 2.6041375398635864, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20847713202238083, "step": 10845 }, { "epoch": 0.16195432249008876, "grad_norm": 0.3046875, "grad_norm_var": 0.0017932732899983725, "learning_rate": 0.0001, "loss": 1.4774, "loss/crossentropy": 2.4458638429641724, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2156554013490677, "step": 10846 }, { "epoch": 0.16196925466070375, "grad_norm": 0.294921875, "grad_norm_var": 0.0015711466471354167, "learning_rate": 0.0001, "loss": 1.3547, "loss/crossentropy": 2.5677733421325684, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1906270608305931, "step": 10847 }, { "epoch": 0.16198418683131874, "grad_norm": 0.314453125, "grad_norm_var": 0.0013439019521077475, "learning_rate": 0.0001, "loss": 1.5012, "loss/crossentropy": 2.5023282766342163, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.20822738111019135, "step": 10848 }, { "epoch": 0.16199911900193373, "grad_norm": 0.267578125, "grad_norm_var": 0.0014847914377848308, "learning_rate": 0.0001, "loss": 1.2864, "loss/crossentropy": 2.449618697166443, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.16528860479593277, "step": 10849 }, { "epoch": 0.1620140511725487, "grad_norm": 0.29296875, "grad_norm_var": 0.0015117486317952475, "learning_rate": 0.0001, "loss": 1.2789, "loss/crossentropy": 2.6069557666778564, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.1577582061290741, "step": 10850 }, { "epoch": 0.16202898334316368, "grad_norm": 0.259765625, "grad_norm_var": 0.0005261580149332683, "learning_rate": 0.0001, "loss": 1.3245, "loss/crossentropy": 2.5451282262802124, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.18387408554553986, "step": 10851 }, { "epoch": 0.16204391551377867, "grad_norm": 0.458984375, "grad_norm_var": 0.0020552158355712892, "learning_rate": 0.0001, "loss": 2.3521, "loss/crossentropy": 2.4944037199020386, "loss/fcd": 1.953125, "loss/idx": 11.0, "loss/logits": 0.39896416664123535, "step": 10852 }, { "epoch": 0.16205884768439363, "grad_norm": 0.32421875, "grad_norm_var": 0.0020603020985921223, "learning_rate": 0.0001, "loss": 1.4865, "loss/crossentropy": 2.6768118143081665, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.20911621302366257, "step": 10853 }, { "epoch": 0.16207377985500862, "grad_norm": 0.27734375, "grad_norm_var": 0.002093235651652018, "learning_rate": 0.0001, "loss": 1.3387, "loss/crossentropy": 2.625655770301819, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18638097494840622, "step": 10854 }, { "epoch": 0.1620887120256236, "grad_norm": 0.318359375, "grad_norm_var": 0.0020579020182291668, "learning_rate": 0.0001, "loss": 1.5016, "loss/crossentropy": 2.7730478048324585, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22039803117513657, "step": 10855 }, { "epoch": 0.1621036441962386, "grad_norm": 0.37109375, "grad_norm_var": 0.0022600809733072918, "learning_rate": 0.0001, "loss": 1.5605, "loss/crossentropy": 2.665923237800598, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.2362813875079155, "step": 10856 }, { "epoch": 0.16211857636685356, "grad_norm": 0.48828125, "grad_norm_var": 0.004079119364420573, "learning_rate": 0.0001, "loss": 1.4765, "loss/crossentropy": 2.6085193157196045, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20310890674591064, "step": 10857 }, { "epoch": 0.16213350853746855, "grad_norm": 0.283203125, "grad_norm_var": 0.004164632161458333, "learning_rate": 0.0001, "loss": 1.4117, "loss/crossentropy": 2.417849898338318, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20075052976608276, "step": 10858 }, { "epoch": 0.16214844070808354, "grad_norm": 0.294921875, "grad_norm_var": 0.004192845026652018, "learning_rate": 0.0001, "loss": 1.4774, "loss/crossentropy": 2.6046983003616333, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21571959555149078, "step": 10859 }, { "epoch": 0.1621633728786985, "grad_norm": 0.30859375, "grad_norm_var": 0.004198821385701498, "learning_rate": 0.0001, "loss": 1.3656, "loss/crossentropy": 2.4381603002548218, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.17808844149112701, "step": 10860 }, { "epoch": 0.1621783050493135, "grad_norm": 0.27734375, "grad_norm_var": 0.004266802469889323, "learning_rate": 0.0001, "loss": 1.457, "loss/crossentropy": 2.518872380256653, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2226748764514923, "step": 10861 }, { "epoch": 0.16219323721992848, "grad_norm": 0.349609375, "grad_norm_var": 0.004294951756795247, "learning_rate": 0.0001, "loss": 1.4429, "loss/crossentropy": 2.6796798706054688, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.1929120570421219, "step": 10862 }, { "epoch": 0.16220816939054344, "grad_norm": 0.27734375, "grad_norm_var": 0.004382069905598958, "learning_rate": 0.0001, "loss": 1.3539, "loss/crossentropy": 2.6781238317489624, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1898316666483879, "step": 10863 }, { "epoch": 0.16222310156115843, "grad_norm": 0.30078125, "grad_norm_var": 0.004408884048461914, "learning_rate": 0.0001, "loss": 1.3739, "loss/crossentropy": 2.7235299348831177, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19807519018650055, "step": 10864 }, { "epoch": 0.16223803373177342, "grad_norm": 0.408203125, "grad_norm_var": 0.004626321792602539, "learning_rate": 0.0001, "loss": 1.7701, "loss/crossentropy": 2.3894591331481934, "loss/fcd": 1.52734375, "loss/idx": 11.0, "loss/logits": 0.24273645877838135, "step": 10865 }, { "epoch": 0.1622529659023884, "grad_norm": 0.287109375, "grad_norm_var": 0.004657936096191406, "learning_rate": 0.0001, "loss": 1.2544, "loss/crossentropy": 2.479996919631958, "loss/fcd": 1.09375, "loss/idx": 11.0, "loss/logits": 0.16068702191114426, "step": 10866 }, { "epoch": 0.16226789807300337, "grad_norm": 0.326171875, "grad_norm_var": 0.004308827718098958, "learning_rate": 0.0001, "loss": 1.5859, "loss/crossentropy": 2.8321008682250977, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.25383830070495605, "step": 10867 }, { "epoch": 0.16228283024361836, "grad_norm": 0.322265625, "grad_norm_var": 0.0032073338826497395, "learning_rate": 0.0001, "loss": 1.4662, "loss/crossentropy": 2.8057050704956055, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21617528051137924, "step": 10868 }, { "epoch": 0.16229776241423335, "grad_norm": 0.396484375, "grad_norm_var": 0.003517262140909831, "learning_rate": 0.0001, "loss": 1.6587, "loss/crossentropy": 2.301467001438141, "loss/fcd": 1.4140625, "loss/idx": 11.0, "loss/logits": 0.24461417645215988, "step": 10869 }, { "epoch": 0.16231269458484832, "grad_norm": 0.34765625, "grad_norm_var": 0.0033284346262613933, "learning_rate": 0.0001, "loss": 1.4932, "loss/crossentropy": 2.584107518196106, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22370723634958267, "step": 10870 }, { "epoch": 0.1623276267554633, "grad_norm": 0.26953125, "grad_norm_var": 0.003584734598795573, "learning_rate": 0.0001, "loss": 1.367, "loss/crossentropy": 2.614268183708191, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1951601430773735, "step": 10871 }, { "epoch": 0.1623425589260783, "grad_norm": 0.326171875, "grad_norm_var": 0.0034754276275634766, "learning_rate": 0.0001, "loss": 1.3602, "loss/crossentropy": 2.6456456184387207, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18048050999641418, "step": 10872 }, { "epoch": 0.16235749109669326, "grad_norm": 0.421875, "grad_norm_var": 0.002340555191040039, "learning_rate": 0.0001, "loss": 1.5058, "loss/crossentropy": 2.671826958656311, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21673311293125153, "step": 10873 }, { "epoch": 0.16237242326730825, "grad_norm": 0.28515625, "grad_norm_var": 0.0023299535115559895, "learning_rate": 0.0001, "loss": 1.2919, "loss/crossentropy": 2.6899794340133667, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.1707722395658493, "step": 10874 }, { "epoch": 0.16238735543792324, "grad_norm": 0.333984375, "grad_norm_var": 0.0022689183553059895, "learning_rate": 0.0001, "loss": 1.5863, "loss/crossentropy": 2.4445247650146484, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.23471909761428833, "step": 10875 }, { "epoch": 0.16240228760853823, "grad_norm": 0.2890625, "grad_norm_var": 0.0023417154947916667, "learning_rate": 0.0001, "loss": 1.4071, "loss/crossentropy": 2.516788363456726, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20400236546993256, "step": 10876 }, { "epoch": 0.1624172197791532, "grad_norm": 0.294921875, "grad_norm_var": 0.0022465864817301433, "learning_rate": 0.0001, "loss": 1.5835, "loss/crossentropy": 2.26416152715683, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.24755969643592834, "step": 10877 }, { "epoch": 0.16243215194976818, "grad_norm": 0.359375, "grad_norm_var": 0.002281634012858073, "learning_rate": 0.0001, "loss": 1.5118, "loss/crossentropy": 2.7314059734344482, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.23443793505430222, "step": 10878 }, { "epoch": 0.16244708412038317, "grad_norm": 0.2734375, "grad_norm_var": 0.0023089090983072916, "learning_rate": 0.0001, "loss": 1.2933, "loss/crossentropy": 2.587040901184082, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.17217734456062317, "step": 10879 }, { "epoch": 0.16246201629099813, "grad_norm": 0.322265625, "grad_norm_var": 0.0022608280181884766, "learning_rate": 0.0001, "loss": 1.4218, "loss/crossentropy": 2.5687466859817505, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19919102638959885, "step": 10880 }, { "epoch": 0.16247694846161312, "grad_norm": 0.29296875, "grad_norm_var": 0.0018735249837239583, "learning_rate": 0.0001, "loss": 1.485, "loss/crossentropy": 2.555764675140381, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21157249808311462, "step": 10881 }, { "epoch": 0.1624918806322281, "grad_norm": 0.291015625, "grad_norm_var": 0.0018564224243164062, "learning_rate": 0.0001, "loss": 1.3304, "loss/crossentropy": 2.6109336614608765, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1780364215373993, "step": 10882 }, { "epoch": 0.1625068128028431, "grad_norm": 0.287109375, "grad_norm_var": 0.0019301732381184896, "learning_rate": 0.0001, "loss": 1.4, "loss/crossentropy": 2.7423810958862305, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.20464959740638733, "step": 10883 }, { "epoch": 0.16252174497345806, "grad_norm": 0.33984375, "grad_norm_var": 0.0019557793935139974, "learning_rate": 0.0001, "loss": 1.5188, "loss/crossentropy": 2.656746029853821, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.21799461543560028, "step": 10884 }, { "epoch": 0.16253667714407305, "grad_norm": 0.29296875, "grad_norm_var": 0.0015792210896809896, "learning_rate": 0.0001, "loss": 1.388, "loss/crossentropy": 2.278515040874481, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.17703945934772491, "step": 10885 }, { "epoch": 0.16255160931468804, "grad_norm": 0.267578125, "grad_norm_var": 0.0016228834788004558, "learning_rate": 0.0001, "loss": 1.3891, "loss/crossentropy": 2.7168467044830322, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19379620254039764, "step": 10886 }, { "epoch": 0.162566541485303, "grad_norm": 0.31640625, "grad_norm_var": 0.0015122572580973308, "learning_rate": 0.0001, "loss": 1.4706, "loss/crossentropy": 2.7703120708465576, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.23233607411384583, "step": 10887 }, { "epoch": 0.162581473655918, "grad_norm": 0.2734375, "grad_norm_var": 0.001587359110514323, "learning_rate": 0.0001, "loss": 1.2651, "loss/crossentropy": 2.562964916229248, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.16748306155204773, "step": 10888 }, { "epoch": 0.16259640582653298, "grad_norm": 0.396484375, "grad_norm_var": 0.001244974136352539, "learning_rate": 0.0001, "loss": 1.4136, "loss/crossentropy": 2.4536800384521484, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19481844455003738, "step": 10889 }, { "epoch": 0.16261133799714794, "grad_norm": 0.3046875, "grad_norm_var": 0.0012112776438395181, "learning_rate": 0.0001, "loss": 1.5044, "loss/crossentropy": 2.4900089502334595, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.2231353521347046, "step": 10890 }, { "epoch": 0.16262627016776293, "grad_norm": 0.298828125, "grad_norm_var": 0.0011689345041910808, "learning_rate": 0.0001, "loss": 1.4398, "loss/crossentropy": 2.5071682929992676, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2132306545972824, "step": 10891 }, { "epoch": 0.16264120233837792, "grad_norm": 0.458984375, "grad_norm_var": 0.0025835673014322916, "learning_rate": 0.0001, "loss": 1.6577, "loss/crossentropy": 2.694698452949524, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.2592870816588402, "step": 10892 }, { "epoch": 0.1626561345089929, "grad_norm": 0.353515625, "grad_norm_var": 0.002626482645670573, "learning_rate": 0.0001, "loss": 1.3012, "loss/crossentropy": 2.65848445892334, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.16449464857578278, "step": 10893 }, { "epoch": 0.16267106667960787, "grad_norm": 0.32421875, "grad_norm_var": 0.002521769205729167, "learning_rate": 0.0001, "loss": 1.4125, "loss/crossentropy": 2.5775821208953857, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19374921917915344, "step": 10894 }, { "epoch": 0.16268599885022286, "grad_norm": 0.431640625, "grad_norm_var": 0.003138462702433268, "learning_rate": 0.0001, "loss": 1.5252, "loss/crossentropy": 2.7647597789764404, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.24789026379585266, "step": 10895 }, { "epoch": 0.16270093102083785, "grad_norm": 0.337890625, "grad_norm_var": 0.0031412601470947265, "learning_rate": 0.0001, "loss": 1.398, "loss/crossentropy": 2.8637856245040894, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.2026441991329193, "step": 10896 }, { "epoch": 0.16271586319145281, "grad_norm": 0.359375, "grad_norm_var": 0.003095865249633789, "learning_rate": 0.0001, "loss": 1.4127, "loss/crossentropy": 2.7031434774398804, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19785769283771515, "step": 10897 }, { "epoch": 0.1627307953620678, "grad_norm": 0.3203125, "grad_norm_var": 0.0029840469360351562, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 2.669415235519409, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.22710323333740234, "step": 10898 }, { "epoch": 0.1627457275326828, "grad_norm": 0.34765625, "grad_norm_var": 0.0028248945871988933, "learning_rate": 0.0001, "loss": 1.3872, "loss/crossentropy": 2.5830475091934204, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18018066883087158, "step": 10899 }, { "epoch": 0.16276065970329778, "grad_norm": 0.369140625, "grad_norm_var": 0.0028818766276041668, "learning_rate": 0.0001, "loss": 1.523, "loss/crossentropy": 2.57088565826416, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.22226788848638535, "step": 10900 }, { "epoch": 0.16277559187391274, "grad_norm": 0.33984375, "grad_norm_var": 0.002720133463541667, "learning_rate": 0.0001, "loss": 1.3577, "loss/crossentropy": 2.698743224143982, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17801857739686966, "step": 10901 }, { "epoch": 0.16279052404452773, "grad_norm": 0.341796875, "grad_norm_var": 0.002310625712076823, "learning_rate": 0.0001, "loss": 1.4667, "loss/crossentropy": 2.7036794424057007, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.22447650134563446, "step": 10902 }, { "epoch": 0.16280545621514272, "grad_norm": 0.36328125, "grad_norm_var": 0.002248064676920573, "learning_rate": 0.0001, "loss": 1.4903, "loss/crossentropy": 2.5179964303970337, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.2168646603822708, "step": 10903 }, { "epoch": 0.1628203883857577, "grad_norm": 0.361328125, "grad_norm_var": 0.0018181959788004557, "learning_rate": 0.0001, "loss": 1.5797, "loss/crossentropy": 2.1326085329055786, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.23590397834777832, "step": 10904 }, { "epoch": 0.16283532055637268, "grad_norm": 0.30859375, "grad_norm_var": 0.001836077372233073, "learning_rate": 0.0001, "loss": 1.4915, "loss/crossentropy": 2.700379729270935, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2298043966293335, "step": 10905 }, { "epoch": 0.16285025272698767, "grad_norm": 0.326171875, "grad_norm_var": 0.0017313480377197266, "learning_rate": 0.0001, "loss": 1.5001, "loss/crossentropy": 2.5079504251480103, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.2032540887594223, "step": 10906 }, { "epoch": 0.16286518489760263, "grad_norm": 0.2890625, "grad_norm_var": 0.001807403564453125, "learning_rate": 0.0001, "loss": 1.3765, "loss/crossentropy": 2.217496871948242, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19293053448200226, "step": 10907 }, { "epoch": 0.16288011706821762, "grad_norm": 0.400390625, "grad_norm_var": 0.0011865615844726563, "learning_rate": 0.0001, "loss": 1.6424, "loss/crossentropy": 2.363964080810547, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.2595924288034439, "step": 10908 }, { "epoch": 0.1628950492388326, "grad_norm": 0.375, "grad_norm_var": 0.0012300968170166015, "learning_rate": 0.0001, "loss": 1.5637, "loss/crossentropy": 2.610603928565979, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.2277267649769783, "step": 10909 }, { "epoch": 0.1629099814094476, "grad_norm": 0.275390625, "grad_norm_var": 0.0015452067057291667, "learning_rate": 0.0001, "loss": 1.3265, "loss/crossentropy": 2.5330679416656494, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17801590263843536, "step": 10910 }, { "epoch": 0.16292491358006256, "grad_norm": 0.30859375, "grad_norm_var": 0.0010975996653238933, "learning_rate": 0.0001, "loss": 1.4295, "loss/crossentropy": 2.7876564264297485, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.21072101593017578, "step": 10911 }, { "epoch": 0.16293984575067755, "grad_norm": 0.439453125, "grad_norm_var": 0.0017274061838785807, "learning_rate": 0.0001, "loss": 1.4341, "loss/crossentropy": 2.5848976373672485, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19969510287046432, "step": 10912 }, { "epoch": 0.16295477792129254, "grad_norm": 0.326171875, "grad_norm_var": 0.001734161376953125, "learning_rate": 0.0001, "loss": 1.3531, "loss/crossentropy": 2.7491865158081055, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18900035321712494, "step": 10913 }, { "epoch": 0.1629697100919075, "grad_norm": 0.328125, "grad_norm_var": 0.0017140706380208333, "learning_rate": 0.0001, "loss": 1.342, "loss/crossentropy": 2.6017225980758667, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18573743104934692, "step": 10914 }, { "epoch": 0.1629846422625225, "grad_norm": 0.3203125, "grad_norm_var": 0.0017465591430664063, "learning_rate": 0.0001, "loss": 1.5976, "loss/crossentropy": 2.6418049335479736, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.24606814235448837, "step": 10915 }, { "epoch": 0.16299957443313748, "grad_norm": 0.296875, "grad_norm_var": 0.001811838150024414, "learning_rate": 0.0001, "loss": 1.3379, "loss/crossentropy": 2.4664472341537476, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.17380522191524506, "step": 10916 }, { "epoch": 0.16301450660375247, "grad_norm": 0.29296875, "grad_norm_var": 0.0019346714019775391, "learning_rate": 0.0001, "loss": 1.3808, "loss/crossentropy": 2.4309587478637695, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1932653710246086, "step": 10917 }, { "epoch": 0.16302943877436743, "grad_norm": 0.35546875, "grad_norm_var": 0.0019594828287760415, "learning_rate": 0.0001, "loss": 1.4074, "loss/crossentropy": 2.3750271797180176, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.18867118656635284, "step": 10918 }, { "epoch": 0.16304437094498242, "grad_norm": 0.30859375, "grad_norm_var": 0.0019434611002604166, "learning_rate": 0.0001, "loss": 1.5101, "loss/crossentropy": 2.6369874477386475, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2210213616490364, "step": 10919 }, { "epoch": 0.1630593031155974, "grad_norm": 0.337890625, "grad_norm_var": 0.0018862406412760416, "learning_rate": 0.0001, "loss": 1.4121, "loss/crossentropy": 2.7099462747573853, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.1972249448299408, "step": 10920 }, { "epoch": 0.16307423528621237, "grad_norm": 0.3203125, "grad_norm_var": 0.0018604914347330728, "learning_rate": 0.0001, "loss": 1.4272, "loss/crossentropy": 2.5123268365859985, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.1928723305463791, "step": 10921 }, { "epoch": 0.16308916745682736, "grad_norm": 0.322265625, "grad_norm_var": 0.0018641153971354167, "learning_rate": 0.0001, "loss": 1.4366, "loss/crossentropy": 2.5224486589431763, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20613256841897964, "step": 10922 }, { "epoch": 0.16310409962744235, "grad_norm": 0.46484375, "grad_norm_var": 0.0028111139933268228, "learning_rate": 0.0001, "loss": 1.5906, "loss/crossentropy": 2.4378488063812256, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.2429581731557846, "step": 10923 }, { "epoch": 0.1631190317980573, "grad_norm": 0.333984375, "grad_norm_var": 0.0025700887044270834, "learning_rate": 0.0001, "loss": 1.4237, "loss/crossentropy": 2.452791690826416, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.2010495439171791, "step": 10924 }, { "epoch": 0.1631339639686723, "grad_norm": 0.353515625, "grad_norm_var": 0.0024926344553629558, "learning_rate": 0.0001, "loss": 1.6213, "loss/crossentropy": 2.311464786529541, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.25017618387937546, "step": 10925 }, { "epoch": 0.1631488961392873, "grad_norm": 0.328125, "grad_norm_var": 0.0022364298502604166, "learning_rate": 0.0001, "loss": 1.4655, "loss/crossentropy": 2.8857078552246094, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.23505554348230362, "step": 10926 }, { "epoch": 0.16316382830990228, "grad_norm": 0.359375, "grad_norm_var": 0.002186012268066406, "learning_rate": 0.0001, "loss": 1.3826, "loss/crossentropy": 2.6837291717529297, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18732932209968567, "step": 10927 }, { "epoch": 0.16317876048051724, "grad_norm": 0.412109375, "grad_norm_var": 0.0018811543782552083, "learning_rate": 0.0001, "loss": 1.899, "loss/crossentropy": 2.2418582439422607, "loss/fcd": 1.6171875, "loss/idx": 11.0, "loss/logits": 0.2817782685160637, "step": 10928 }, { "epoch": 0.16319369265113223, "grad_norm": 0.296875, "grad_norm_var": 0.0019939263661702474, "learning_rate": 0.0001, "loss": 1.2074, "loss/crossentropy": 2.542891263961792, "loss/fcd": 1.0625, "loss/idx": 11.0, "loss/logits": 0.14492955803871155, "step": 10929 }, { "epoch": 0.16320862482174722, "grad_norm": 0.35546875, "grad_norm_var": 0.0019992669423421223, "learning_rate": 0.0001, "loss": 1.4387, "loss/crossentropy": 2.442623019218445, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.1848168447613716, "step": 10930 }, { "epoch": 0.16322355699236218, "grad_norm": 0.3203125, "grad_norm_var": 0.0019992669423421223, "learning_rate": 0.0001, "loss": 1.3723, "loss/crossentropy": 2.6803306341171265, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18874412775039673, "step": 10931 }, { "epoch": 0.16323848916297717, "grad_norm": 0.390625, "grad_norm_var": 0.0019946893056233725, "learning_rate": 0.0001, "loss": 1.6107, "loss/crossentropy": 2.3599092960357666, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.2123003676533699, "step": 10932 }, { "epoch": 0.16325342133359216, "grad_norm": 0.341796875, "grad_norm_var": 0.0017916361490885417, "learning_rate": 0.0001, "loss": 1.609, "loss/crossentropy": 2.06281840801239, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.2262101024389267, "step": 10933 }, { "epoch": 0.16326835350420713, "grad_norm": 0.376953125, "grad_norm_var": 0.0018358707427978515, "learning_rate": 0.0001, "loss": 1.898, "loss/crossentropy": 2.448893904685974, "loss/fcd": 1.51953125, "loss/idx": 11.0, "loss/logits": 0.37848397344350815, "step": 10934 }, { "epoch": 0.16328328567482212, "grad_norm": 0.384765625, "grad_norm_var": 0.0017633438110351562, "learning_rate": 0.0001, "loss": 1.3295, "loss/crossentropy": 2.7045645713806152, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.16931641101837158, "step": 10935 }, { "epoch": 0.1632982178454371, "grad_norm": 0.30078125, "grad_norm_var": 0.001940011978149414, "learning_rate": 0.0001, "loss": 1.488, "loss/crossentropy": 2.5810431241989136, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.21846744418144226, "step": 10936 }, { "epoch": 0.1633131500160521, "grad_norm": 0.275390625, "grad_norm_var": 0.0022672017415364585, "learning_rate": 0.0001, "loss": 1.3585, "loss/crossentropy": 2.4455196857452393, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1826869621872902, "step": 10937 }, { "epoch": 0.16332808218666706, "grad_norm": 0.283203125, "grad_norm_var": 0.0025126139322916665, "learning_rate": 0.0001, "loss": 1.2925, "loss/crossentropy": 2.6150801181793213, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.17527876049280167, "step": 10938 }, { "epoch": 0.16334301435728205, "grad_norm": 0.31640625, "grad_norm_var": 0.0015897115071614584, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.5667572021484375, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19693942368030548, "step": 10939 }, { "epoch": 0.16335794652789704, "grad_norm": 0.2890625, "grad_norm_var": 0.0017480055491129558, "learning_rate": 0.0001, "loss": 1.4002, "loss/crossentropy": 2.5826364755630493, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19312280416488647, "step": 10940 }, { "epoch": 0.163372878698512, "grad_norm": 0.26171875, "grad_norm_var": 0.0020669937133789063, "learning_rate": 0.0001, "loss": 1.3985, "loss/crossentropy": 2.449235439300537, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19151433557271957, "step": 10941 }, { "epoch": 0.163387810869127, "grad_norm": 0.314453125, "grad_norm_var": 0.002083571751912435, "learning_rate": 0.0001, "loss": 1.4972, "loss/crossentropy": 2.6098963022232056, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.2198144495487213, "step": 10942 }, { "epoch": 0.16340274303974198, "grad_norm": 0.294921875, "grad_norm_var": 0.0020903905232747396, "learning_rate": 0.0001, "loss": 1.3822, "loss/crossentropy": 2.6590514183044434, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19856446981430054, "step": 10943 }, { "epoch": 0.16341767521035697, "grad_norm": 0.33203125, "grad_norm_var": 0.0015710035959879557, "learning_rate": 0.0001, "loss": 1.3739, "loss/crossentropy": 2.537051200866699, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1824875771999359, "step": 10944 }, { "epoch": 0.16343260738097193, "grad_norm": 0.3203125, "grad_norm_var": 0.0015301863352457682, "learning_rate": 0.0001, "loss": 1.3871, "loss/crossentropy": 2.414454460144043, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19181698560714722, "step": 10945 }, { "epoch": 0.16344753955158692, "grad_norm": 0.30859375, "grad_norm_var": 0.0014607588450113933, "learning_rate": 0.0001, "loss": 1.5062, "loss/crossentropy": 2.567057251930237, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.217096246778965, "step": 10946 }, { "epoch": 0.1634624717222019, "grad_norm": 0.28125, "grad_norm_var": 0.001551675796508789, "learning_rate": 0.0001, "loss": 1.3704, "loss/crossentropy": 2.546539068222046, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1906842440366745, "step": 10947 }, { "epoch": 0.16347740389281687, "grad_norm": 0.333984375, "grad_norm_var": 0.0011962890625, "learning_rate": 0.0001, "loss": 1.4352, "loss/crossentropy": 2.680678963661194, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.18515644967556, "step": 10948 }, { "epoch": 0.16349233606343186, "grad_norm": 0.390625, "grad_norm_var": 0.001529677708943685, "learning_rate": 0.0001, "loss": 1.7598, "loss/crossentropy": 2.5502835512161255, "loss/fcd": 1.48046875, "loss/idx": 11.0, "loss/logits": 0.27929599583148956, "step": 10949 }, { "epoch": 0.16350726823404685, "grad_norm": 0.283203125, "grad_norm_var": 0.001323684056599935, "learning_rate": 0.0001, "loss": 1.3833, "loss/crossentropy": 2.6220465898513794, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19583473354578018, "step": 10950 }, { "epoch": 0.1635222004046618, "grad_norm": 0.263671875, "grad_norm_var": 0.0010438124338785807, "learning_rate": 0.0001, "loss": 1.3225, "loss/crossentropy": 2.2543141841888428, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17408234626054764, "step": 10951 }, { "epoch": 0.1635371325752768, "grad_norm": 0.265625, "grad_norm_var": 0.0011319319407145183, "learning_rate": 0.0001, "loss": 1.357, "loss/crossentropy": 2.39047908782959, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18124383687973022, "step": 10952 }, { "epoch": 0.1635520647458918, "grad_norm": 0.25390625, "grad_norm_var": 0.0012338638305664062, "learning_rate": 0.0001, "loss": 1.313, "loss/crossentropy": 2.5426701307296753, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17241178452968597, "step": 10953 }, { "epoch": 0.16356699691650678, "grad_norm": 0.271484375, "grad_norm_var": 0.00126800537109375, "learning_rate": 0.0001, "loss": 1.3895, "loss/crossentropy": 2.8033727407455444, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19813811779022217, "step": 10954 }, { "epoch": 0.16358192908712174, "grad_norm": 0.298828125, "grad_norm_var": 0.0012461185455322266, "learning_rate": 0.0001, "loss": 1.4892, "loss/crossentropy": 2.794933557510376, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.21969885379076004, "step": 10955 }, { "epoch": 0.16359686125773673, "grad_norm": 0.31640625, "grad_norm_var": 0.0012612501780192057, "learning_rate": 0.0001, "loss": 1.3155, "loss/crossentropy": 2.4285210371017456, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.16705983132123947, "step": 10956 }, { "epoch": 0.16361179342835172, "grad_norm": 0.322265625, "grad_norm_var": 0.0011858622233072917, "learning_rate": 0.0001, "loss": 1.6993, "loss/crossentropy": 2.4457314014434814, "loss/fcd": 1.4375, "loss/idx": 11.0, "loss/logits": 0.2617662698030472, "step": 10957 }, { "epoch": 0.16362672559896668, "grad_norm": 0.2890625, "grad_norm_var": 0.0011881351470947265, "learning_rate": 0.0001, "loss": 1.397, "loss/crossentropy": 2.6817818880081177, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18607445806264877, "step": 10958 }, { "epoch": 0.16364165776958167, "grad_norm": 0.283203125, "grad_norm_var": 0.0012072086334228515, "learning_rate": 0.0001, "loss": 1.4047, "loss/crossentropy": 2.4397175312042236, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19380761682987213, "step": 10959 }, { "epoch": 0.16365658994019666, "grad_norm": 0.302734375, "grad_norm_var": 0.0011392593383789062, "learning_rate": 0.0001, "loss": 1.4582, "loss/crossentropy": 2.530435800552368, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2082294300198555, "step": 10960 }, { "epoch": 0.16367152211081165, "grad_norm": 0.3125, "grad_norm_var": 0.0011209487915039063, "learning_rate": 0.0001, "loss": 1.5374, "loss/crossentropy": 2.4882712364196777, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.23266570270061493, "step": 10961 }, { "epoch": 0.1636864542814266, "grad_norm": 0.27734375, "grad_norm_var": 0.001140276590983073, "learning_rate": 0.0001, "loss": 1.346, "loss/crossentropy": 2.5351732969284058, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.16238772869110107, "step": 10962 }, { "epoch": 0.1637013864520416, "grad_norm": 0.28515625, "grad_norm_var": 0.0011332194010416667, "learning_rate": 0.0001, "loss": 1.3521, "loss/crossentropy": 2.5800544023513794, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.19580567628145218, "step": 10963 }, { "epoch": 0.1637163186226566, "grad_norm": 0.3203125, "grad_norm_var": 0.0010772546132405598, "learning_rate": 0.0001, "loss": 1.3341, "loss/crossentropy": 2.6708850860595703, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1778666153550148, "step": 10964 }, { "epoch": 0.16373125079327155, "grad_norm": 0.275390625, "grad_norm_var": 0.00045363108317057293, "learning_rate": 0.0001, "loss": 1.3095, "loss/crossentropy": 2.526607871055603, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1728207916021347, "step": 10965 }, { "epoch": 0.16374618296388654, "grad_norm": 0.30078125, "grad_norm_var": 0.00045978228251139324, "learning_rate": 0.0001, "loss": 1.4392, "loss/crossentropy": 2.586723566055298, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.21268215775489807, "step": 10966 }, { "epoch": 0.16376111513450153, "grad_norm": 0.314453125, "grad_norm_var": 0.0004432519276936849, "learning_rate": 0.0001, "loss": 1.5846, "loss/crossentropy": 2.53553307056427, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.25259358435869217, "step": 10967 }, { "epoch": 0.1637760473051165, "grad_norm": 0.30859375, "grad_norm_var": 0.0004012902577718099, "learning_rate": 0.0001, "loss": 1.6127, "loss/crossentropy": 2.4689364433288574, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.25335393100976944, "step": 10968 }, { "epoch": 0.16379097947573149, "grad_norm": 0.291015625, "grad_norm_var": 0.00028018951416015626, "learning_rate": 0.0001, "loss": 1.4587, "loss/crossentropy": 2.5656827688217163, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20476359128952026, "step": 10969 }, { "epoch": 0.16380591164634647, "grad_norm": 0.259765625, "grad_norm_var": 0.000330352783203125, "learning_rate": 0.0001, "loss": 1.379, "loss/crossentropy": 2.644415020942688, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19934171438217163, "step": 10970 }, { "epoch": 0.16382084381696146, "grad_norm": 0.291015625, "grad_norm_var": 0.0003326416015625, "learning_rate": 0.0001, "loss": 1.3719, "loss/crossentropy": 2.8602585792541504, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.184352844953537, "step": 10971 }, { "epoch": 0.16383577598757643, "grad_norm": 0.32421875, "grad_norm_var": 0.0003568013509114583, "learning_rate": 0.0001, "loss": 1.4156, "loss/crossentropy": 2.6402453184127808, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.2007630467414856, "step": 10972 }, { "epoch": 0.16385070815819142, "grad_norm": 0.3671875, "grad_norm_var": 0.0006320794423421224, "learning_rate": 0.0001, "loss": 1.6643, "loss/crossentropy": 2.3787981271743774, "loss/fcd": 1.4296875, "loss/idx": 11.0, "loss/logits": 0.2346576303243637, "step": 10973 }, { "epoch": 0.1638656403288064, "grad_norm": 0.365234375, "grad_norm_var": 0.0008818944295247396, "learning_rate": 0.0001, "loss": 1.5945, "loss/crossentropy": 2.265053451061249, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.22343402355909348, "step": 10974 }, { "epoch": 0.16388057249942137, "grad_norm": 0.42578125, "grad_norm_var": 0.001739358901977539, "learning_rate": 0.0001, "loss": 1.4631, "loss/crossentropy": 2.7787699699401855, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.1974720135331154, "step": 10975 }, { "epoch": 0.16389550467003636, "grad_norm": 0.306640625, "grad_norm_var": 0.0017345269521077474, "learning_rate": 0.0001, "loss": 1.5461, "loss/crossentropy": 2.167451858520508, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.2297227755188942, "step": 10976 }, { "epoch": 0.16391043684065135, "grad_norm": 0.306640625, "grad_norm_var": 0.0017379124959309895, "learning_rate": 0.0001, "loss": 1.3298, "loss/crossentropy": 2.5162041187286377, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18530788272619247, "step": 10977 }, { "epoch": 0.16392536901126634, "grad_norm": 0.29296875, "grad_norm_var": 0.0016773859659830728, "learning_rate": 0.0001, "loss": 1.4127, "loss/crossentropy": 2.95611035823822, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20959335565567017, "step": 10978 }, { "epoch": 0.1639403011818813, "grad_norm": 0.330078125, "grad_norm_var": 0.0016265710194905598, "learning_rate": 0.0001, "loss": 1.3458, "loss/crossentropy": 2.644837498664856, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18563620746135712, "step": 10979 }, { "epoch": 0.1639552333524963, "grad_norm": 0.333984375, "grad_norm_var": 0.00164337158203125, "learning_rate": 0.0001, "loss": 1.4918, "loss/crossentropy": 2.7915035486221313, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21443892270326614, "step": 10980 }, { "epoch": 0.16397016552311128, "grad_norm": 0.376953125, "grad_norm_var": 0.0017061869303385416, "learning_rate": 0.0001, "loss": 1.586, "loss/crossentropy": 2.550040364265442, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.24228473007678986, "step": 10981 }, { "epoch": 0.16398509769372624, "grad_norm": 0.33984375, "grad_norm_var": 0.00167694091796875, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.451618194580078, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20847187936306, "step": 10982 }, { "epoch": 0.16400002986434123, "grad_norm": 0.396484375, "grad_norm_var": 0.0019586563110351564, "learning_rate": 0.0001, "loss": 1.5301, "loss/crossentropy": 2.4217605590820312, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.21761418879032135, "step": 10983 }, { "epoch": 0.16401496203495622, "grad_norm": 0.369140625, "grad_norm_var": 0.001996596654256185, "learning_rate": 0.0001, "loss": 1.5648, "loss/crossentropy": 2.4058172702789307, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.22884328663349152, "step": 10984 }, { "epoch": 0.16402989420557118, "grad_norm": 0.318359375, "grad_norm_var": 0.001879103978474935, "learning_rate": 0.0001, "loss": 1.5562, "loss/crossentropy": 2.3951990604400635, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.22028351575136185, "step": 10985 }, { "epoch": 0.16404482637618617, "grad_norm": 0.3046875, "grad_norm_var": 0.0015380223592122396, "learning_rate": 0.0001, "loss": 1.4875, "loss/crossentropy": 2.6559784412384033, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2257591038942337, "step": 10986 }, { "epoch": 0.16405975854680116, "grad_norm": 0.28515625, "grad_norm_var": 0.0015788873036702473, "learning_rate": 0.0001, "loss": 1.3951, "loss/crossentropy": 2.7156519889831543, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.20757415890693665, "step": 10987 }, { "epoch": 0.16407469071741615, "grad_norm": 0.50390625, "grad_norm_var": 0.0032137393951416015, "learning_rate": 0.0001, "loss": 1.8228, "loss/crossentropy": 2.3153611421585083, "loss/fcd": 1.5234375, "loss/idx": 11.0, "loss/logits": 0.2993137612938881, "step": 10988 }, { "epoch": 0.1640896228880311, "grad_norm": 0.38671875, "grad_norm_var": 0.0032785892486572265, "learning_rate": 0.0001, "loss": 1.4563, "loss/crossentropy": 2.485298275947571, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19853214919567108, "step": 10989 }, { "epoch": 0.1641045550586461, "grad_norm": 0.3203125, "grad_norm_var": 0.0033294041951497395, "learning_rate": 0.0001, "loss": 1.4174, "loss/crossentropy": 2.6402148008346558, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19470474123954773, "step": 10990 }, { "epoch": 0.1641194872292611, "grad_norm": 0.376953125, "grad_norm_var": 0.0029840946197509767, "learning_rate": 0.0001, "loss": 1.612, "loss/crossentropy": 2.190512776374817, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.24483857303857803, "step": 10991 }, { "epoch": 0.16413441939987605, "grad_norm": 0.30078125, "grad_norm_var": 0.003017616271972656, "learning_rate": 0.0001, "loss": 1.3909, "loss/crossentropy": 2.5613903999328613, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19947831332683563, "step": 10992 }, { "epoch": 0.16414935157049104, "grad_norm": 0.3203125, "grad_norm_var": 0.0029567559560139973, "learning_rate": 0.0001, "loss": 1.4158, "loss/crossentropy": 2.5323914289474487, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.2009587064385414, "step": 10993 }, { "epoch": 0.16416428374110603, "grad_norm": 0.302734375, "grad_norm_var": 0.002891985575358073, "learning_rate": 0.0001, "loss": 1.4221, "loss/crossentropy": 2.56355082988739, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19552700221538544, "step": 10994 }, { "epoch": 0.164179215911721, "grad_norm": 0.32421875, "grad_norm_var": 0.002908054987589518, "learning_rate": 0.0001, "loss": 1.3667, "loss/crossentropy": 2.7112843990325928, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.17524658143520355, "step": 10995 }, { "epoch": 0.16419414808233598, "grad_norm": 0.43359375, "grad_norm_var": 0.003348223368326823, "learning_rate": 0.0001, "loss": 1.6691, "loss/crossentropy": 2.9983361959457397, "loss/fcd": 1.43359375, "loss/idx": 11.0, "loss/logits": 0.23552114516496658, "step": 10996 }, { "epoch": 0.16420908025295097, "grad_norm": 0.3046875, "grad_norm_var": 0.0034511407216389974, "learning_rate": 0.0001, "loss": 1.409, "loss/crossentropy": 2.4653472900390625, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19809047132730484, "step": 10997 }, { "epoch": 0.16422401242356596, "grad_norm": 0.333984375, "grad_norm_var": 0.0034606297810872394, "learning_rate": 0.0001, "loss": 1.5769, "loss/crossentropy": 2.4725455045700073, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.23314417153596878, "step": 10998 }, { "epoch": 0.16423894459418092, "grad_norm": 0.294921875, "grad_norm_var": 0.0034606297810872394, "learning_rate": 0.0001, "loss": 1.3444, "loss/crossentropy": 2.3546040058135986, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18036624789237976, "step": 10999 }, { "epoch": 0.16425387676479591, "grad_norm": 0.5234375, "grad_norm_var": 0.005496072769165039, "learning_rate": 0.0001, "loss": 1.483, "loss/crossentropy": 2.3607442378997803, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21741457283496857, "step": 11000 }, { "epoch": 0.1642688089354109, "grad_norm": 0.298828125, "grad_norm_var": 0.005607970555623372, "learning_rate": 0.0001, "loss": 1.5003, "loss/crossentropy": 2.5561749935150146, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2112797573208809, "step": 11001 }, { "epoch": 0.16428374110602587, "grad_norm": 0.314453125, "grad_norm_var": 0.005553690592447916, "learning_rate": 0.0001, "loss": 1.5069, "loss/crossentropy": 2.4884153604507446, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2178811952471733, "step": 11002 }, { "epoch": 0.16429867327664086, "grad_norm": 0.30078125, "grad_norm_var": 0.00543060302734375, "learning_rate": 0.0001, "loss": 1.3948, "loss/crossentropy": 2.60225772857666, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18780747056007385, "step": 11003 }, { "epoch": 0.16431360544725584, "grad_norm": 0.3046875, "grad_norm_var": 0.0038904190063476563, "learning_rate": 0.0001, "loss": 1.4277, "loss/crossentropy": 2.5948041677474976, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.1972067952156067, "step": 11004 }, { "epoch": 0.16432853761787083, "grad_norm": 0.287109375, "grad_norm_var": 0.0038912296295166016, "learning_rate": 0.0001, "loss": 1.5081, "loss/crossentropy": 2.6995474100112915, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.22290394455194473, "step": 11005 }, { "epoch": 0.1643434697884858, "grad_norm": 0.384765625, "grad_norm_var": 0.004034423828125, "learning_rate": 0.0001, "loss": 1.5233, "loss/crossentropy": 2.476089119911194, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2303316667675972, "step": 11006 }, { "epoch": 0.16435840195910079, "grad_norm": 0.310546875, "grad_norm_var": 0.00396416982014974, "learning_rate": 0.0001, "loss": 1.4021, "loss/crossentropy": 2.5766637325286865, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.18723683059215546, "step": 11007 }, { "epoch": 0.16437333412971578, "grad_norm": 0.291015625, "grad_norm_var": 0.00401304562886556, "learning_rate": 0.0001, "loss": 1.4218, "loss/crossentropy": 2.8279069662094116, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.2109116166830063, "step": 11008 }, { "epoch": 0.16438826630033074, "grad_norm": 0.3203125, "grad_norm_var": 0.00401304562886556, "learning_rate": 0.0001, "loss": 1.4097, "loss/crossentropy": 2.685703158378601, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.190902478992939, "step": 11009 }, { "epoch": 0.16440319847094573, "grad_norm": 0.345703125, "grad_norm_var": 0.003954299290974935, "learning_rate": 0.0001, "loss": 1.6105, "loss/crossentropy": 2.4455974102020264, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.23941553384065628, "step": 11010 }, { "epoch": 0.16441813064156072, "grad_norm": 0.416015625, "grad_norm_var": 0.004339027404785156, "learning_rate": 0.0001, "loss": 1.4529, "loss/crossentropy": 2.4793636798858643, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20678862929344177, "step": 11011 }, { "epoch": 0.16443306281217568, "grad_norm": 0.28515625, "grad_norm_var": 0.0038944880167643228, "learning_rate": 0.0001, "loss": 1.4336, "loss/crossentropy": 2.56273877620697, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20311643183231354, "step": 11012 }, { "epoch": 0.16444799498279067, "grad_norm": 0.314453125, "grad_norm_var": 0.0038645267486572266, "learning_rate": 0.0001, "loss": 1.3825, "loss/crossentropy": 2.450145125389099, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18325405567884445, "step": 11013 }, { "epoch": 0.16446292715340566, "grad_norm": 0.318359375, "grad_norm_var": 0.0038774967193603515, "learning_rate": 0.0001, "loss": 1.4008, "loss/crossentropy": 2.5313730239868164, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18982506543397903, "step": 11014 }, { "epoch": 0.16447785932402065, "grad_norm": 0.291015625, "grad_norm_var": 0.0038977146148681642, "learning_rate": 0.0001, "loss": 1.3962, "loss/crossentropy": 2.7260804176330566, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.2008904293179512, "step": 11015 }, { "epoch": 0.1644927914946356, "grad_norm": 0.2578125, "grad_norm_var": 0.0015155633290608725, "learning_rate": 0.0001, "loss": 1.2974, "loss/crossentropy": 2.5214293003082275, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.168486550450325, "step": 11016 }, { "epoch": 0.1645077236652506, "grad_norm": 0.318359375, "grad_norm_var": 0.0014971256256103515, "learning_rate": 0.0001, "loss": 1.5106, "loss/crossentropy": 2.493959665298462, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.21373505145311356, "step": 11017 }, { "epoch": 0.1645226558358656, "grad_norm": 0.28125, "grad_norm_var": 0.0015741348266601562, "learning_rate": 0.0001, "loss": 1.3615, "loss/crossentropy": 2.815467119216919, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19350608438253403, "step": 11018 }, { "epoch": 0.16453758800648055, "grad_norm": 0.5859375, "grad_norm_var": 0.0061457316080729164, "learning_rate": 0.0001, "loss": 1.6961, "loss/crossentropy": 2.554149627685547, "loss/fcd": 1.47265625, "loss/idx": 11.0, "loss/logits": 0.22341589629650116, "step": 11019 }, { "epoch": 0.16455252017709554, "grad_norm": 0.31640625, "grad_norm_var": 0.0061115900675455725, "learning_rate": 0.0001, "loss": 1.465, "loss/crossentropy": 2.774269938468933, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21503029763698578, "step": 11020 }, { "epoch": 0.16456745234771053, "grad_norm": 0.29296875, "grad_norm_var": 0.006078068415323893, "learning_rate": 0.0001, "loss": 1.4216, "loss/crossentropy": 2.442627191543579, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1950351893901825, "step": 11021 }, { "epoch": 0.16458238451832552, "grad_norm": 0.3203125, "grad_norm_var": 0.0058939615885416664, "learning_rate": 0.0001, "loss": 1.4243, "loss/crossentropy": 2.4359588623046875, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20161838084459305, "step": 11022 }, { "epoch": 0.16459731668894048, "grad_norm": 0.2890625, "grad_norm_var": 0.005975961685180664, "learning_rate": 0.0001, "loss": 1.4469, "loss/crossentropy": 2.468379259109497, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20859186351299286, "step": 11023 }, { "epoch": 0.16461224885955547, "grad_norm": 0.27734375, "grad_norm_var": 0.00605462392171224, "learning_rate": 0.0001, "loss": 1.5037, "loss/crossentropy": 2.212713599205017, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2341982051730156, "step": 11024 }, { "epoch": 0.16462718103017046, "grad_norm": 0.33203125, "grad_norm_var": 0.006052907307942708, "learning_rate": 0.0001, "loss": 1.5474, "loss/crossentropy": 2.6306087970733643, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.2310250997543335, "step": 11025 }, { "epoch": 0.16464211320078542, "grad_norm": 0.298828125, "grad_norm_var": 0.006077321370442709, "learning_rate": 0.0001, "loss": 1.3351, "loss/crossentropy": 2.5014290809631348, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.17889151722192764, "step": 11026 }, { "epoch": 0.1646570453714004, "grad_norm": 0.291015625, "grad_norm_var": 0.005532073974609375, "learning_rate": 0.0001, "loss": 1.292, "loss/crossentropy": 2.6745142936706543, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.16698771715164185, "step": 11027 }, { "epoch": 0.1646719775420154, "grad_norm": 0.33984375, "grad_norm_var": 0.005487569173177083, "learning_rate": 0.0001, "loss": 1.4434, "loss/crossentropy": 2.612790107727051, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20116552710533142, "step": 11028 }, { "epoch": 0.16468690971263036, "grad_norm": 0.28515625, "grad_norm_var": 0.005564101537068685, "learning_rate": 0.0001, "loss": 1.3668, "loss/crossentropy": 2.6970890760421753, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1871115192770958, "step": 11029 }, { "epoch": 0.16470184188324535, "grad_norm": 0.349609375, "grad_norm_var": 0.005624628067016602, "learning_rate": 0.0001, "loss": 1.3517, "loss/crossentropy": 2.8503576517105103, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17985112965106964, "step": 11030 }, { "epoch": 0.16471677405386034, "grad_norm": 0.44921875, "grad_norm_var": 0.006568336486816406, "learning_rate": 0.0001, "loss": 1.5691, "loss/crossentropy": 2.773330330848694, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.24877993762493134, "step": 11031 }, { "epoch": 0.16473170622447533, "grad_norm": 0.3359375, "grad_norm_var": 0.006194496154785156, "learning_rate": 0.0001, "loss": 1.4236, "loss/crossentropy": 2.4920071363449097, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.1931317076086998, "step": 11032 }, { "epoch": 0.1647466383950903, "grad_norm": 0.294921875, "grad_norm_var": 0.006281471252441407, "learning_rate": 0.0001, "loss": 1.6461, "loss/crossentropy": 2.578394651412964, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.2672045826911926, "step": 11033 }, { "epoch": 0.16476157056570528, "grad_norm": 0.2890625, "grad_norm_var": 0.006230608622233073, "learning_rate": 0.0001, "loss": 1.3421, "loss/crossentropy": 2.41552996635437, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18192770332098007, "step": 11034 }, { "epoch": 0.16477650273632027, "grad_norm": 0.32421875, "grad_norm_var": 0.001728057861328125, "learning_rate": 0.0001, "loss": 1.6304, "loss/crossentropy": 2.599447011947632, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.259341225028038, "step": 11035 }, { "epoch": 0.16479143490693524, "grad_norm": 0.318359375, "grad_norm_var": 0.001727914810180664, "learning_rate": 0.0001, "loss": 1.6628, "loss/crossentropy": 2.767750382423401, "loss/fcd": 1.40625, "loss/idx": 11.0, "loss/logits": 0.25653956830501556, "step": 11036 }, { "epoch": 0.16480636707755023, "grad_norm": 0.294921875, "grad_norm_var": 0.001721636454264323, "learning_rate": 0.0001, "loss": 1.514, "loss/crossentropy": 2.7768044471740723, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.224918432533741, "step": 11037 }, { "epoch": 0.16482129924816522, "grad_norm": 0.291015625, "grad_norm_var": 0.0017666975657145182, "learning_rate": 0.0001, "loss": 1.3721, "loss/crossentropy": 2.7134629487991333, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18852641433477402, "step": 11038 }, { "epoch": 0.1648362314187802, "grad_norm": 0.328125, "grad_norm_var": 0.001720285415649414, "learning_rate": 0.0001, "loss": 1.3971, "loss/crossentropy": 2.794131875038147, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19005482643842697, "step": 11039 }, { "epoch": 0.16485116358939517, "grad_norm": 0.298828125, "grad_norm_var": 0.0016305923461914062, "learning_rate": 0.0001, "loss": 1.4272, "loss/crossentropy": 2.7829986810684204, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20458029955625534, "step": 11040 }, { "epoch": 0.16486609576001016, "grad_norm": 0.3125, "grad_norm_var": 0.0016232808430989584, "learning_rate": 0.0001, "loss": 1.6203, "loss/crossentropy": 2.0080920457839966, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.2335619255900383, "step": 11041 }, { "epoch": 0.16488102793062515, "grad_norm": 0.361328125, "grad_norm_var": 0.001700592041015625, "learning_rate": 0.0001, "loss": 1.508, "loss/crossentropy": 2.6681236028671265, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.25013960897922516, "step": 11042 }, { "epoch": 0.1648959601012401, "grad_norm": 0.287109375, "grad_norm_var": 0.0017180760701497396, "learning_rate": 0.0001, "loss": 1.3997, "loss/crossentropy": 2.701100468635559, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.2083214670419693, "step": 11043 }, { "epoch": 0.1649108922718551, "grad_norm": 0.318359375, "grad_norm_var": 0.0016972700754801433, "learning_rate": 0.0001, "loss": 1.398, "loss/crossentropy": 2.671782374382019, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1870742216706276, "step": 11044 }, { "epoch": 0.1649258244424701, "grad_norm": 0.30078125, "grad_norm_var": 0.0016375064849853515, "learning_rate": 0.0001, "loss": 1.4859, "loss/crossentropy": 2.5134713649749756, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.2124827653169632, "step": 11045 }, { "epoch": 0.16494075661308505, "grad_norm": 0.28515625, "grad_norm_var": 0.0016611099243164062, "learning_rate": 0.0001, "loss": 1.3241, "loss/crossentropy": 2.67908251285553, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.16396038979291916, "step": 11046 }, { "epoch": 0.16495568878370004, "grad_norm": 0.341796875, "grad_norm_var": 0.0005045413970947265, "learning_rate": 0.0001, "loss": 1.5715, "loss/crossentropy": 2.7742608785629272, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.2512257620692253, "step": 11047 }, { "epoch": 0.16497062095431503, "grad_norm": 0.408203125, "grad_norm_var": 0.001067352294921875, "learning_rate": 0.0001, "loss": 1.7123, "loss/crossentropy": 2.9800959825515747, "loss/fcd": 1.40625, "loss/idx": 11.0, "loss/logits": 0.3060711473226547, "step": 11048 }, { "epoch": 0.16498555312493002, "grad_norm": 1.3828125, "grad_norm_var": 0.07199095090230306, "learning_rate": 0.0001, "loss": 2.1058, "loss/crossentropy": 2.8006240129470825, "loss/fcd": 1.50390625, "loss/idx": 11.0, "loss/logits": 0.6019350960850716, "step": 11049 }, { "epoch": 0.16500048529554498, "grad_norm": 0.2734375, "grad_norm_var": 0.07220381100972494, "learning_rate": 0.0001, "loss": 1.3673, "loss/crossentropy": 2.449586033821106, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18757469952106476, "step": 11050 }, { "epoch": 0.16501541746615997, "grad_norm": 0.365234375, "grad_norm_var": 0.07198785146077474, "learning_rate": 0.0001, "loss": 1.4701, "loss/crossentropy": 2.380818724632263, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20058849453926086, "step": 11051 }, { "epoch": 0.16503034963677496, "grad_norm": 0.32421875, "grad_norm_var": 0.07193754514058431, "learning_rate": 0.0001, "loss": 1.4668, "loss/crossentropy": 2.7614691257476807, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.21284763514995575, "step": 11052 }, { "epoch": 0.16504528180738992, "grad_norm": 0.28515625, "grad_norm_var": 0.07206192016601562, "learning_rate": 0.0001, "loss": 1.3796, "loss/crossentropy": 2.6094906330108643, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19214142858982086, "step": 11053 }, { "epoch": 0.1650602139780049, "grad_norm": 0.322265625, "grad_norm_var": 0.0717302958170573, "learning_rate": 0.0001, "loss": 1.2818, "loss/crossentropy": 2.849372148513794, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.16070982068777084, "step": 11054 }, { "epoch": 0.1650751461486199, "grad_norm": 0.302734375, "grad_norm_var": 0.07197060585021972, "learning_rate": 0.0001, "loss": 1.4056, "loss/crossentropy": 2.6421010494232178, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19465594738721848, "step": 11055 }, { "epoch": 0.16509007831923486, "grad_norm": 0.3125, "grad_norm_var": 0.0718240737915039, "learning_rate": 0.0001, "loss": 1.4431, "loss/crossentropy": 2.3864586353302, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20869207382202148, "step": 11056 }, { "epoch": 0.16510501048984985, "grad_norm": 0.318359375, "grad_norm_var": 0.0717684268951416, "learning_rate": 0.0001, "loss": 1.3478, "loss/crossentropy": 2.705499291419983, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17597020417451859, "step": 11057 }, { "epoch": 0.16511994266046484, "grad_norm": 0.287109375, "grad_norm_var": 0.07236517270406087, "learning_rate": 0.0001, "loss": 1.3717, "loss/crossentropy": 2.4751540422439575, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18416303396224976, "step": 11058 }, { "epoch": 0.16513487483107983, "grad_norm": 0.326171875, "grad_norm_var": 0.07196526527404785, "learning_rate": 0.0001, "loss": 1.5751, "loss/crossentropy": 2.6547516584396362, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.243090458214283, "step": 11059 }, { "epoch": 0.1651498070016948, "grad_norm": 0.28515625, "grad_norm_var": 0.0723276138305664, "learning_rate": 0.0001, "loss": 1.3794, "loss/crossentropy": 2.846866726875305, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19192201644182205, "step": 11060 }, { "epoch": 0.16516473917230978, "grad_norm": 0.271484375, "grad_norm_var": 0.07270073890686035, "learning_rate": 0.0001, "loss": 1.2992, "loss/crossentropy": 2.53082013130188, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1703079640865326, "step": 11061 }, { "epoch": 0.16517967134292477, "grad_norm": 0.322265625, "grad_norm_var": 0.07231388092041016, "learning_rate": 0.0001, "loss": 1.4152, "loss/crossentropy": 2.5114588737487793, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19643063843250275, "step": 11062 }, { "epoch": 0.16519460351353973, "grad_norm": 0.314453125, "grad_norm_var": 0.0725110371907552, "learning_rate": 0.0001, "loss": 1.4368, "loss/crossentropy": 2.6895103454589844, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20243743807077408, "step": 11063 }, { "epoch": 0.16520953568415472, "grad_norm": 0.314453125, "grad_norm_var": 0.0727246602376302, "learning_rate": 0.0001, "loss": 1.4461, "loss/crossentropy": 2.695860743522644, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21559828519821167, "step": 11064 }, { "epoch": 0.1652244678547697, "grad_norm": 0.255859375, "grad_norm_var": 0.0007401625315348307, "learning_rate": 0.0001, "loss": 1.312, "loss/crossentropy": 2.661701202392578, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1791834458708763, "step": 11065 }, { "epoch": 0.1652394000253847, "grad_norm": 0.2734375, "grad_norm_var": 0.0007401625315348307, "learning_rate": 0.0001, "loss": 1.2854, "loss/crossentropy": 2.598586320877075, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.1721460372209549, "step": 11066 }, { "epoch": 0.16525433219599966, "grad_norm": 0.34765625, "grad_norm_var": 0.0006184260050455729, "learning_rate": 0.0001, "loss": 1.5353, "loss/crossentropy": 2.7043405771255493, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.2071627676486969, "step": 11067 }, { "epoch": 0.16526926436661465, "grad_norm": 0.29296875, "grad_norm_var": 0.0005950291951497395, "learning_rate": 0.0001, "loss": 1.4839, "loss/crossentropy": 2.3857158422470093, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21830317378044128, "step": 11068 }, { "epoch": 0.16528419653722964, "grad_norm": 0.279296875, "grad_norm_var": 0.0006103356679280599, "learning_rate": 0.0001, "loss": 1.4275, "loss/crossentropy": 2.5941396951675415, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20089223235845566, "step": 11069 }, { "epoch": 0.1652991287078446, "grad_norm": 0.3203125, "grad_norm_var": 0.0006052017211914063, "learning_rate": 0.0001, "loss": 1.5948, "loss/crossentropy": 2.3902798891067505, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.23547260463237762, "step": 11070 }, { "epoch": 0.1653140608784596, "grad_norm": 0.3046875, "grad_norm_var": 0.0006057580312093098, "learning_rate": 0.0001, "loss": 1.4873, "loss/crossentropy": 2.608621597290039, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21389640122652054, "step": 11071 }, { "epoch": 0.16532899304907459, "grad_norm": 0.3984375, "grad_norm_var": 0.0011918226877848308, "learning_rate": 0.0001, "loss": 1.4579, "loss/crossentropy": 2.7869133949279785, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21959201246500015, "step": 11072 }, { "epoch": 0.16534392521968955, "grad_norm": 0.359375, "grad_norm_var": 0.0013590494791666666, "learning_rate": 0.0001, "loss": 1.325, "loss/crossentropy": 2.467186689376831, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1648659035563469, "step": 11073 }, { "epoch": 0.16535885739030454, "grad_norm": 0.251953125, "grad_norm_var": 0.001541582743326823, "learning_rate": 0.0001, "loss": 1.3172, "loss/crossentropy": 2.421812891960144, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1648138463497162, "step": 11074 }, { "epoch": 0.16537378956091953, "grad_norm": 0.318359375, "grad_norm_var": 0.0015258153279622396, "learning_rate": 0.0001, "loss": 1.3652, "loss/crossentropy": 2.912403106689453, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18160776793956757, "step": 11075 }, { "epoch": 0.16538872173153452, "grad_norm": 0.275390625, "grad_norm_var": 0.001560068130493164, "learning_rate": 0.0001, "loss": 1.3916, "loss/crossentropy": 2.657392144203186, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19239593297243118, "step": 11076 }, { "epoch": 0.16540365390214948, "grad_norm": 0.330078125, "grad_norm_var": 0.001502847671508789, "learning_rate": 0.0001, "loss": 1.4529, "loss/crossentropy": 2.663680911064148, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.19900934398174286, "step": 11077 }, { "epoch": 0.16541858607276447, "grad_norm": 0.296875, "grad_norm_var": 0.0015014012654622395, "learning_rate": 0.0001, "loss": 1.344, "loss/crossentropy": 2.581374168395996, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1759883463382721, "step": 11078 }, { "epoch": 0.16543351824337946, "grad_norm": 0.3125, "grad_norm_var": 0.0015000502268473308, "learning_rate": 0.0001, "loss": 1.4935, "loss/crossentropy": 2.4458800554275513, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.24346904456615448, "step": 11079 }, { "epoch": 0.16544845041399442, "grad_norm": 0.3125, "grad_norm_var": 0.0014986673990885417, "learning_rate": 0.0001, "loss": 1.3721, "loss/crossentropy": 2.7102901935577393, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.2002018764615059, "step": 11080 }, { "epoch": 0.1654633825846094, "grad_norm": 0.3203125, "grad_norm_var": 0.0013093153635660808, "learning_rate": 0.0001, "loss": 1.5787, "loss/crossentropy": 2.49763286113739, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.24666451662778854, "step": 11081 }, { "epoch": 0.1654783147552244, "grad_norm": 0.294921875, "grad_norm_var": 0.0012273152669270834, "learning_rate": 0.0001, "loss": 1.403, "loss/crossentropy": 2.706109642982483, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19593417644500732, "step": 11082 }, { "epoch": 0.1654932469258394, "grad_norm": 0.29296875, "grad_norm_var": 0.001165008544921875, "learning_rate": 0.0001, "loss": 1.3456, "loss/crossentropy": 2.716760993003845, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18157991766929626, "step": 11083 }, { "epoch": 0.16550817909645435, "grad_norm": 0.27734375, "grad_norm_var": 0.0012158711751302084, "learning_rate": 0.0001, "loss": 1.3304, "loss/crossentropy": 2.6269962787628174, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17801406979560852, "step": 11084 }, { "epoch": 0.16552311126706934, "grad_norm": 0.298828125, "grad_norm_var": 0.0011621475219726562, "learning_rate": 0.0001, "loss": 1.3714, "loss/crossentropy": 2.5761696100234985, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.17605935037136078, "step": 11085 }, { "epoch": 0.16553804343768433, "grad_norm": 0.32421875, "grad_norm_var": 0.0011683146158854166, "learning_rate": 0.0001, "loss": 1.4013, "loss/crossentropy": 2.683514952659607, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.17469053715467453, "step": 11086 }, { "epoch": 0.1655529756082993, "grad_norm": 0.287109375, "grad_norm_var": 0.0012013594309488933, "learning_rate": 0.0001, "loss": 1.4208, "loss/crossentropy": 2.7263368368148804, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19420050829648972, "step": 11087 }, { "epoch": 0.16556790777891428, "grad_norm": 0.388671875, "grad_norm_var": 0.001091448465983073, "learning_rate": 0.0001, "loss": 1.6308, "loss/crossentropy": 2.775664806365967, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.23233585804700851, "step": 11088 }, { "epoch": 0.16558283994952927, "grad_norm": 0.376953125, "grad_norm_var": 0.0012292067209879558, "learning_rate": 0.0001, "loss": 1.4715, "loss/crossentropy": 2.795540928840637, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.18639026582241058, "step": 11089 }, { "epoch": 0.16559777212014423, "grad_norm": 0.400390625, "grad_norm_var": 0.0014587243398030598, "learning_rate": 0.0001, "loss": 1.4938, "loss/crossentropy": 2.9397770166397095, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.2360268086194992, "step": 11090 }, { "epoch": 0.16561270429075922, "grad_norm": 0.36328125, "grad_norm_var": 0.001579729715983073, "learning_rate": 0.0001, "loss": 1.455, "loss/crossentropy": 2.613964557647705, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.18935686349868774, "step": 11091 }, { "epoch": 0.1656276364613742, "grad_norm": 0.33203125, "grad_norm_var": 0.001428079605102539, "learning_rate": 0.0001, "loss": 1.4501, "loss/crossentropy": 2.3881982564926147, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20011600106954575, "step": 11092 }, { "epoch": 0.1656425686319892, "grad_norm": 0.275390625, "grad_norm_var": 0.0015820662180582683, "learning_rate": 0.0001, "loss": 1.3252, "loss/crossentropy": 2.4700390100479126, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1767655834555626, "step": 11093 }, { "epoch": 0.16565750080260416, "grad_norm": 0.6953125, "grad_norm_var": 0.010161701838175457, "learning_rate": 0.0001, "loss": 1.615, "loss/crossentropy": 2.4532630443573, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.22047099471092224, "step": 11094 }, { "epoch": 0.16567243297321915, "grad_norm": 0.326171875, "grad_norm_var": 0.010110410054524739, "learning_rate": 0.0001, "loss": 1.2586, "loss/crossentropy": 2.4511711597442627, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.16092438250780106, "step": 11095 }, { "epoch": 0.16568736514383414, "grad_norm": 0.32421875, "grad_norm_var": 0.010063680013020833, "learning_rate": 0.0001, "loss": 1.3751, "loss/crossentropy": 2.6745811700820923, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1876194179058075, "step": 11096 }, { "epoch": 0.1657022973144491, "grad_norm": 0.369140625, "grad_norm_var": 0.010028314590454102, "learning_rate": 0.0001, "loss": 1.4861, "loss/crossentropy": 2.6765884160995483, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.20482581853866577, "step": 11097 }, { "epoch": 0.1657172294850641, "grad_norm": 0.322265625, "grad_norm_var": 0.009868097305297852, "learning_rate": 0.0001, "loss": 1.4423, "loss/crossentropy": 2.339818239212036, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.1961873471736908, "step": 11098 }, { "epoch": 0.16573216165567908, "grad_norm": 0.296875, "grad_norm_var": 0.009837579727172852, "learning_rate": 0.0001, "loss": 1.3345, "loss/crossentropy": 2.639102816581726, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.18610521405935287, "step": 11099 }, { "epoch": 0.16574709382629407, "grad_norm": 0.33203125, "grad_norm_var": 0.00946818987528483, "learning_rate": 0.0001, "loss": 1.4481, "loss/crossentropy": 2.7426421642303467, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21762161701917648, "step": 11100 }, { "epoch": 0.16576202599690903, "grad_norm": 0.283203125, "grad_norm_var": 0.009604756037394207, "learning_rate": 0.0001, "loss": 1.3322, "loss/crossentropy": 2.7173320055007935, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17990083247423172, "step": 11101 }, { "epoch": 0.16577695816752402, "grad_norm": 0.298828125, "grad_norm_var": 0.009752909342447916, "learning_rate": 0.0001, "loss": 1.4802, "loss/crossentropy": 2.6387524604797363, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.2224164456129074, "step": 11102 }, { "epoch": 0.16579189033813901, "grad_norm": 0.314453125, "grad_norm_var": 0.009553972880045574, "learning_rate": 0.0001, "loss": 1.455, "loss/crossentropy": 2.468829393386841, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20109768211841583, "step": 11103 }, { "epoch": 0.16580682250875398, "grad_norm": 0.314453125, "grad_norm_var": 0.009576924641927083, "learning_rate": 0.0001, "loss": 1.49, "loss/crossentropy": 2.6048383712768555, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.22441846132278442, "step": 11104 }, { "epoch": 0.16582175467936897, "grad_norm": 0.28125, "grad_norm_var": 0.009825372695922851, "learning_rate": 0.0001, "loss": 1.375, "loss/crossentropy": 2.4837799072265625, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18750295042991638, "step": 11105 }, { "epoch": 0.16583668684998396, "grad_norm": 0.29296875, "grad_norm_var": 0.009761555989583334, "learning_rate": 0.0001, "loss": 1.3452, "loss/crossentropy": 2.682545781135559, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1772790178656578, "step": 11106 }, { "epoch": 0.16585161902059892, "grad_norm": 0.30078125, "grad_norm_var": 0.00980224609375, "learning_rate": 0.0001, "loss": 1.4728, "loss/crossentropy": 2.7100459337234497, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21496079117059708, "step": 11107 }, { "epoch": 0.1658665511912139, "grad_norm": 0.3359375, "grad_norm_var": 0.009801673889160156, "learning_rate": 0.0001, "loss": 1.6878, "loss/crossentropy": 2.5312386751174927, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.2776213437318802, "step": 11108 }, { "epoch": 0.1658814833618289, "grad_norm": 0.318359375, "grad_norm_var": 0.009574381510416667, "learning_rate": 0.0001, "loss": 1.3729, "loss/crossentropy": 2.5047343969345093, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.17763662338256836, "step": 11109 }, { "epoch": 0.16589641553244389, "grad_norm": 0.25390625, "grad_norm_var": 0.0007160822550455729, "learning_rate": 0.0001, "loss": 1.2746, "loss/crossentropy": 2.3529231548309326, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.15743338316679, "step": 11110 }, { "epoch": 0.16591134770305885, "grad_norm": 0.30078125, "grad_norm_var": 0.0007026513417561849, "learning_rate": 0.0001, "loss": 1.5193, "loss/crossentropy": 2.42548406124115, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.24193225055933, "step": 11111 }, { "epoch": 0.16592627987367384, "grad_norm": 0.322265625, "grad_norm_var": 0.0006988525390625, "learning_rate": 0.0001, "loss": 1.7023, "loss/crossentropy": 2.588403582572937, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.29210948944091797, "step": 11112 }, { "epoch": 0.16594121204428883, "grad_norm": 0.34375, "grad_norm_var": 0.0005341688791910807, "learning_rate": 0.0001, "loss": 1.3301, "loss/crossentropy": 2.7262390851974487, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.15818904340267181, "step": 11113 }, { "epoch": 0.1659561442149038, "grad_norm": 0.322265625, "grad_norm_var": 0.0005341688791910807, "learning_rate": 0.0001, "loss": 1.3841, "loss/crossentropy": 2.8385244607925415, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19272176921367645, "step": 11114 }, { "epoch": 0.16597107638551878, "grad_norm": 0.318359375, "grad_norm_var": 0.0005339940388997395, "learning_rate": 0.0001, "loss": 1.3534, "loss/crossentropy": 2.729763984680176, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18934479355812073, "step": 11115 }, { "epoch": 0.16598600855613377, "grad_norm": 0.291015625, "grad_norm_var": 0.0005096276601155599, "learning_rate": 0.0001, "loss": 1.4243, "loss/crossentropy": 2.582001805305481, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20559502393007278, "step": 11116 }, { "epoch": 0.16600094072674873, "grad_norm": 0.392578125, "grad_norm_var": 0.0009279727935791015, "learning_rate": 0.0001, "loss": 1.773, "loss/crossentropy": 2.5669662952423096, "loss/fcd": 1.4921875, "loss/idx": 11.0, "loss/logits": 0.28077754378318787, "step": 11117 }, { "epoch": 0.16601587289736372, "grad_norm": 0.306640625, "grad_norm_var": 0.0009174187978108724, "learning_rate": 0.0001, "loss": 1.4401, "loss/crossentropy": 2.8311811685562134, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20573939383029938, "step": 11118 }, { "epoch": 0.1660308050679787, "grad_norm": 0.306640625, "grad_norm_var": 0.0009198347727457682, "learning_rate": 0.0001, "loss": 1.3843, "loss/crossentropy": 2.891479253768921, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1889568790793419, "step": 11119 }, { "epoch": 0.1660457372385937, "grad_norm": 0.291015625, "grad_norm_var": 0.0009484450022379557, "learning_rate": 0.0001, "loss": 1.4445, "loss/crossentropy": 2.6750237941741943, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20626714825630188, "step": 11120 }, { "epoch": 0.16606066940920866, "grad_norm": 0.330078125, "grad_norm_var": 0.0009027481079101563, "learning_rate": 0.0001, "loss": 1.3224, "loss/crossentropy": 2.5635985136032104, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.16616003960371017, "step": 11121 }, { "epoch": 0.16607560157982365, "grad_norm": 0.294921875, "grad_norm_var": 0.0008974552154541015, "learning_rate": 0.0001, "loss": 1.4791, "loss/crossentropy": 2.4058148860931396, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20564629137516022, "step": 11122 }, { "epoch": 0.16609053375043864, "grad_norm": 0.37890625, "grad_norm_var": 0.0011377811431884765, "learning_rate": 0.0001, "loss": 1.6884, "loss/crossentropy": 2.8800991773605347, "loss/fcd": 1.4140625, "loss/idx": 11.0, "loss/logits": 0.274386465549469, "step": 11123 }, { "epoch": 0.1661054659210536, "grad_norm": 0.365234375, "grad_norm_var": 0.0012567520141601562, "learning_rate": 0.0001, "loss": 1.3305, "loss/crossentropy": 2.6628434658050537, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1742621660232544, "step": 11124 }, { "epoch": 0.1661203980916686, "grad_norm": 0.296875, "grad_norm_var": 0.0012932936350504556, "learning_rate": 0.0001, "loss": 1.4431, "loss/crossentropy": 2.447953939437866, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20486818999052048, "step": 11125 }, { "epoch": 0.16613533026228358, "grad_norm": 0.287109375, "grad_norm_var": 0.0010709126790364584, "learning_rate": 0.0001, "loss": 1.3603, "loss/crossentropy": 2.4912296533584595, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1805652379989624, "step": 11126 }, { "epoch": 0.16615026243289857, "grad_norm": 0.37109375, "grad_norm_var": 0.0011830647786458333, "learning_rate": 0.0001, "loss": 1.8269, "loss/crossentropy": 2.5931572914123535, "loss/fcd": 1.51171875, "loss/idx": 11.0, "loss/logits": 0.3151448592543602, "step": 11127 }, { "epoch": 0.16616519460351353, "grad_norm": 0.294921875, "grad_norm_var": 0.001244036356608073, "learning_rate": 0.0001, "loss": 1.4583, "loss/crossentropy": 2.495198607444763, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2083134725689888, "step": 11128 }, { "epoch": 0.16618012677412852, "grad_norm": 0.353515625, "grad_norm_var": 0.0012751102447509765, "learning_rate": 0.0001, "loss": 1.3722, "loss/crossentropy": 2.495370626449585, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18077657371759415, "step": 11129 }, { "epoch": 0.1661950589447435, "grad_norm": 0.333984375, "grad_norm_var": 0.001279306411743164, "learning_rate": 0.0001, "loss": 1.4687, "loss/crossentropy": 2.870803713798523, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21089936792850494, "step": 11130 }, { "epoch": 0.16620999111535847, "grad_norm": 0.28125, "grad_norm_var": 0.0014022191365559895, "learning_rate": 0.0001, "loss": 1.2717, "loss/crossentropy": 2.527489185333252, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.17015284299850464, "step": 11131 }, { "epoch": 0.16622492328597346, "grad_norm": 0.310546875, "grad_norm_var": 0.0013415018717447917, "learning_rate": 0.0001, "loss": 1.4284, "loss/crossentropy": 2.586340308189392, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.213514506816864, "step": 11132 }, { "epoch": 0.16623985545658845, "grad_norm": 0.294921875, "grad_norm_var": 0.0010538101196289062, "learning_rate": 0.0001, "loss": 1.4248, "loss/crossentropy": 2.5876704454421997, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19821985065937042, "step": 11133 }, { "epoch": 0.16625478762720342, "grad_norm": 0.30859375, "grad_norm_var": 0.0010509332021077474, "learning_rate": 0.0001, "loss": 1.314, "loss/crossentropy": 2.4495689868927, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.16559897363185883, "step": 11134 }, { "epoch": 0.1662697197978184, "grad_norm": 0.287109375, "grad_norm_var": 0.00110624631245931, "learning_rate": 0.0001, "loss": 1.3901, "loss/crossentropy": 2.811445713043213, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19475102424621582, "step": 11135 }, { "epoch": 0.1662846519684334, "grad_norm": 0.279296875, "grad_norm_var": 0.0011562188466389975, "learning_rate": 0.0001, "loss": 1.3653, "loss/crossentropy": 2.604868531227112, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19731060415506363, "step": 11136 }, { "epoch": 0.16629958413904838, "grad_norm": 0.28125, "grad_norm_var": 0.0012186050415039062, "learning_rate": 0.0001, "loss": 1.4641, "loss/crossentropy": 2.6915444135665894, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.2102041393518448, "step": 11137 }, { "epoch": 0.16631451630966335, "grad_norm": 0.298828125, "grad_norm_var": 0.0012097676595052083, "learning_rate": 0.0001, "loss": 1.3905, "loss/crossentropy": 2.6960558891296387, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18343497812747955, "step": 11138 }, { "epoch": 0.16632944848027834, "grad_norm": 0.38671875, "grad_norm_var": 0.0012812296549479167, "learning_rate": 0.0001, "loss": 1.6404, "loss/crossentropy": 2.48395836353302, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.2693561017513275, "step": 11139 }, { "epoch": 0.16634438065089333, "grad_norm": 0.28515625, "grad_norm_var": 0.00113981564839681, "learning_rate": 0.0001, "loss": 1.3676, "loss/crossentropy": 2.613737463951111, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18402539193630219, "step": 11140 }, { "epoch": 0.1663593128215083, "grad_norm": 0.29296875, "grad_norm_var": 0.001147317886352539, "learning_rate": 0.0001, "loss": 1.5163, "loss/crossentropy": 2.3495386838912964, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.24675023555755615, "step": 11141 }, { "epoch": 0.16637424499212328, "grad_norm": 0.279296875, "grad_norm_var": 0.001174147923787435, "learning_rate": 0.0001, "loss": 1.3005, "loss/crossentropy": 2.4645304679870605, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1755402609705925, "step": 11142 }, { "epoch": 0.16638917716273827, "grad_norm": 0.353515625, "grad_norm_var": 0.0010472615559895834, "learning_rate": 0.0001, "loss": 1.4801, "loss/crossentropy": 2.8629857301712036, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21449527144432068, "step": 11143 }, { "epoch": 0.16640410933335326, "grad_norm": 0.283203125, "grad_norm_var": 0.0010756810506184896, "learning_rate": 0.0001, "loss": 1.2907, "loss/crossentropy": 2.517114758491516, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.165701761841774, "step": 11144 }, { "epoch": 0.16641904150396822, "grad_norm": 0.275390625, "grad_norm_var": 0.0009714126586914063, "learning_rate": 0.0001, "loss": 1.4852, "loss/crossentropy": 2.533381462097168, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2351597547531128, "step": 11145 }, { "epoch": 0.1664339736745832, "grad_norm": 0.302734375, "grad_norm_var": 0.0008991877237955729, "learning_rate": 0.0001, "loss": 1.498, "loss/crossentropy": 2.50685453414917, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.22061140090227127, "step": 11146 }, { "epoch": 0.1664489058451982, "grad_norm": 0.326171875, "grad_norm_var": 0.0009127140045166015, "learning_rate": 0.0001, "loss": 1.4284, "loss/crossentropy": 2.76334547996521, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2096668928861618, "step": 11147 }, { "epoch": 0.16646383801581316, "grad_norm": 0.29296875, "grad_norm_var": 0.00091400146484375, "learning_rate": 0.0001, "loss": 1.3922, "loss/crossentropy": 2.632255792617798, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19684797525405884, "step": 11148 }, { "epoch": 0.16647877018642815, "grad_norm": 0.39453125, "grad_norm_var": 0.001443338394165039, "learning_rate": 0.0001, "loss": 1.5596, "loss/crossentropy": 2.320199728012085, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.20803353190422058, "step": 11149 }, { "epoch": 0.16649370235704314, "grad_norm": 0.28125, "grad_norm_var": 0.0014878431955973306, "learning_rate": 0.0001, "loss": 1.3336, "loss/crossentropy": 2.6223174333572388, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17347443103790283, "step": 11150 }, { "epoch": 0.1665086345276581, "grad_norm": 0.296875, "grad_norm_var": 0.0014688491821289063, "learning_rate": 0.0001, "loss": 1.3935, "loss/crossentropy": 2.7684437036514282, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1943095251917839, "step": 11151 }, { "epoch": 0.1665235666982731, "grad_norm": 0.36328125, "grad_norm_var": 0.0016007582346598307, "learning_rate": 0.0001, "loss": 1.4112, "loss/crossentropy": 2.4781949520111084, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19638024270534515, "step": 11152 }, { "epoch": 0.16653849886888808, "grad_norm": 0.296875, "grad_norm_var": 0.001551675796508789, "learning_rate": 0.0001, "loss": 1.5547, "loss/crossentropy": 2.403424024581909, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.22652573138475418, "step": 11153 }, { "epoch": 0.16655343103950307, "grad_norm": 0.298828125, "grad_norm_var": 0.001551675796508789, "learning_rate": 0.0001, "loss": 1.4621, "loss/crossentropy": 2.5883097648620605, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.22386206686496735, "step": 11154 }, { "epoch": 0.16656836321011803, "grad_norm": 0.29296875, "grad_norm_var": 0.001180887222290039, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.5504941940307617, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1807655766606331, "step": 11155 }, { "epoch": 0.16658329538073302, "grad_norm": 0.314453125, "grad_norm_var": 0.001148223876953125, "learning_rate": 0.0001, "loss": 1.3913, "loss/crossentropy": 2.5974231958389282, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18425846099853516, "step": 11156 }, { "epoch": 0.166598227551348, "grad_norm": 0.29296875, "grad_norm_var": 0.001148223876953125, "learning_rate": 0.0001, "loss": 1.4479, "loss/crossentropy": 2.411039113998413, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.1979224979877472, "step": 11157 }, { "epoch": 0.16661315972196297, "grad_norm": 0.31640625, "grad_norm_var": 0.0010869185129801433, "learning_rate": 0.0001, "loss": 1.4734, "loss/crossentropy": 2.6094441413879395, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21558687835931778, "step": 11158 }, { "epoch": 0.16662809189257796, "grad_norm": 0.27734375, "grad_norm_var": 0.0010218302408854167, "learning_rate": 0.0001, "loss": 1.3983, "loss/crossentropy": 2.5403809547424316, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1951635479927063, "step": 11159 }, { "epoch": 0.16664302406319295, "grad_norm": 0.37890625, "grad_norm_var": 0.0012952009836832681, "learning_rate": 0.0001, "loss": 1.6177, "loss/crossentropy": 2.6463284492492676, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.25834332406520844, "step": 11160 }, { "epoch": 0.16665795623380794, "grad_norm": 0.34765625, "grad_norm_var": 0.0012628555297851563, "learning_rate": 0.0001, "loss": 1.4797, "loss/crossentropy": 2.649124503135681, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2101440727710724, "step": 11161 }, { "epoch": 0.1666728884044229, "grad_norm": 0.294921875, "grad_norm_var": 0.0012816747029622396, "learning_rate": 0.0001, "loss": 1.4779, "loss/crossentropy": 2.6926769018173218, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.208351232111454, "step": 11162 }, { "epoch": 0.1666878205750379, "grad_norm": 0.376953125, "grad_norm_var": 0.0015073140462239583, "learning_rate": 0.0001, "loss": 1.6077, "loss/crossentropy": 2.545274257659912, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.24832560867071152, "step": 11163 }, { "epoch": 0.16670275274565288, "grad_norm": 0.279296875, "grad_norm_var": 0.0015679518381754556, "learning_rate": 0.0001, "loss": 1.2979, "loss/crossentropy": 2.5430983304977417, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.17682478576898575, "step": 11164 }, { "epoch": 0.16671768491626784, "grad_norm": 0.38671875, "grad_norm_var": 0.0014930566151936848, "learning_rate": 0.0001, "loss": 1.6207, "loss/crossentropy": 2.899930477142334, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.25346288084983826, "step": 11165 }, { "epoch": 0.16673261708688283, "grad_norm": 0.294921875, "grad_norm_var": 0.0014368693033854167, "learning_rate": 0.0001, "loss": 1.4125, "loss/crossentropy": 2.8268849849700928, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19762082397937775, "step": 11166 }, { "epoch": 0.16674754925749782, "grad_norm": 0.369140625, "grad_norm_var": 0.0015468438466389974, "learning_rate": 0.0001, "loss": 1.3576, "loss/crossentropy": 2.626080274581909, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18961163610219955, "step": 11167 }, { "epoch": 0.16676248142811279, "grad_norm": 0.30078125, "grad_norm_var": 0.001462411880493164, "learning_rate": 0.0001, "loss": 1.488, "loss/crossentropy": 2.598422050476074, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.2223285511136055, "step": 11168 }, { "epoch": 0.16677741359872778, "grad_norm": 0.3125, "grad_norm_var": 0.001429605484008789, "learning_rate": 0.0001, "loss": 1.5013, "loss/crossentropy": 2.686526298522949, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.23959983885288239, "step": 11169 }, { "epoch": 0.16679234576934276, "grad_norm": 0.33203125, "grad_norm_var": 0.0014006932576497396, "learning_rate": 0.0001, "loss": 1.5553, "loss/crossentropy": 2.564173460006714, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.21541491150856018, "step": 11170 }, { "epoch": 0.16680727793995775, "grad_norm": 0.328125, "grad_norm_var": 0.0013371785481770834, "learning_rate": 0.0001, "loss": 1.4222, "loss/crossentropy": 2.5343843698501587, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20341059565544128, "step": 11171 }, { "epoch": 0.16682221011057272, "grad_norm": 0.3125, "grad_norm_var": 0.0013402144114176432, "learning_rate": 0.0001, "loss": 1.4594, "loss/crossentropy": 2.7173972129821777, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20937057584524155, "step": 11172 }, { "epoch": 0.1668371422811877, "grad_norm": 0.29296875, "grad_norm_var": 0.0013402144114176432, "learning_rate": 0.0001, "loss": 1.3446, "loss/crossentropy": 2.556907057762146, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1844027042388916, "step": 11173 }, { "epoch": 0.1668520744518027, "grad_norm": 0.310546875, "grad_norm_var": 0.0013491312662760417, "learning_rate": 0.0001, "loss": 1.5206, "loss/crossentropy": 2.583172082901001, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.2237354964017868, "step": 11174 }, { "epoch": 0.16686700662241766, "grad_norm": 0.2578125, "grad_norm_var": 0.0014963150024414062, "learning_rate": 0.0001, "loss": 1.3701, "loss/crossentropy": 2.542928457260132, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1943661943078041, "step": 11175 }, { "epoch": 0.16688193879303265, "grad_norm": 0.28125, "grad_norm_var": 0.0013707478841145833, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.7581011056900024, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18263792991638184, "step": 11176 }, { "epoch": 0.16689687096364764, "grad_norm": 0.30078125, "grad_norm_var": 0.0013188680013020834, "learning_rate": 0.0001, "loss": 1.3627, "loss/crossentropy": 2.493814468383789, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1791035309433937, "step": 11177 }, { "epoch": 0.1669118031342626, "grad_norm": 0.392578125, "grad_norm_var": 0.001660601298014323, "learning_rate": 0.0001, "loss": 1.5095, "loss/crossentropy": 2.491147518157959, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.1931299790740013, "step": 11178 }, { "epoch": 0.1669267353048776, "grad_norm": 0.283203125, "grad_norm_var": 0.0015049616495768229, "learning_rate": 0.0001, "loss": 1.2538, "loss/crossentropy": 2.385106325149536, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.15616047382354736, "step": 11179 }, { "epoch": 0.16694166747549258, "grad_norm": 0.302734375, "grad_norm_var": 0.0014286677042643229, "learning_rate": 0.0001, "loss": 1.4983, "loss/crossentropy": 2.4703508615493774, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2092410996556282, "step": 11180 }, { "epoch": 0.16695659964610757, "grad_norm": 0.302734375, "grad_norm_var": 0.001079416275024414, "learning_rate": 0.0001, "loss": 1.3669, "loss/crossentropy": 2.7906538248062134, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19109054654836655, "step": 11181 }, { "epoch": 0.16697153181672253, "grad_norm": 0.310546875, "grad_norm_var": 0.0010613600413004558, "learning_rate": 0.0001, "loss": 1.3597, "loss/crossentropy": 2.7865848541259766, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19171275198459625, "step": 11182 }, { "epoch": 0.16698646398733752, "grad_norm": 0.265625, "grad_norm_var": 0.0009408950805664063, "learning_rate": 0.0001, "loss": 1.4254, "loss/crossentropy": 2.574369430541992, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.2222321331501007, "step": 11183 }, { "epoch": 0.1670013961579525, "grad_norm": 0.314453125, "grad_norm_var": 0.0009441216786702474, "learning_rate": 0.0001, "loss": 1.4072, "loss/crossentropy": 2.6010650396347046, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.18067293614149094, "step": 11184 }, { "epoch": 0.16701632832856747, "grad_norm": 0.326171875, "grad_norm_var": 0.0009671529134114583, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.922887086868286, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2218543142080307, "step": 11185 }, { "epoch": 0.16703126049918246, "grad_norm": 0.291015625, "grad_norm_var": 0.0009361108144124349, "learning_rate": 0.0001, "loss": 1.4226, "loss/crossentropy": 2.7359572649002075, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19995805621147156, "step": 11186 }, { "epoch": 0.16704619266979745, "grad_norm": 0.2890625, "grad_norm_var": 0.0009087721506754557, "learning_rate": 0.0001, "loss": 1.3285, "loss/crossentropy": 2.5428967475891113, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.16834934055805206, "step": 11187 }, { "epoch": 0.16706112484041244, "grad_norm": 0.287109375, "grad_norm_var": 0.0009139378865559896, "learning_rate": 0.0001, "loss": 1.4424, "loss/crossentropy": 2.5146719217300415, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20799598097801208, "step": 11188 }, { "epoch": 0.1670760570110274, "grad_norm": 0.365234375, "grad_norm_var": 0.0011674086252848306, "learning_rate": 0.0001, "loss": 1.449, "loss/crossentropy": 2.4377434253692627, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.1950937956571579, "step": 11189 }, { "epoch": 0.1670909891816424, "grad_norm": 0.28125, "grad_norm_var": 0.0011995951334635416, "learning_rate": 0.0001, "loss": 1.304, "loss/crossentropy": 2.7226332426071167, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.17897947877645493, "step": 11190 }, { "epoch": 0.16710592135225738, "grad_norm": 0.310546875, "grad_norm_var": 0.0010541121164957682, "learning_rate": 0.0001, "loss": 1.3921, "loss/crossentropy": 2.4839563369750977, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18113476037979126, "step": 11191 }, { "epoch": 0.16712085352287234, "grad_norm": 0.287109375, "grad_norm_var": 0.0010365168253580728, "learning_rate": 0.0001, "loss": 1.5204, "loss/crossentropy": 2.7207902669906616, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.219623863697052, "step": 11192 }, { "epoch": 0.16713578569348733, "grad_norm": 0.59765625, "grad_norm_var": 0.00630334218343099, "learning_rate": 0.0001, "loss": 2.0966, "loss/crossentropy": 2.352232336997986, "loss/fcd": 1.734375, "loss/idx": 11.0, "loss/logits": 0.3621927350759506, "step": 11193 }, { "epoch": 0.16715071786410232, "grad_norm": 0.314453125, "grad_norm_var": 0.005985450744628906, "learning_rate": 0.0001, "loss": 1.4439, "loss/crossentropy": 2.4383572340011597, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20170958340168, "step": 11194 }, { "epoch": 0.16716565003471728, "grad_norm": 0.3046875, "grad_norm_var": 0.005907297134399414, "learning_rate": 0.0001, "loss": 1.4056, "loss/crossentropy": 2.744224786758423, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19859424233436584, "step": 11195 }, { "epoch": 0.16718058220533227, "grad_norm": 0.287109375, "grad_norm_var": 0.005962483088175456, "learning_rate": 0.0001, "loss": 1.3928, "loss/crossentropy": 2.550618052482605, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18970650434494019, "step": 11196 }, { "epoch": 0.16719551437594726, "grad_norm": 0.404296875, "grad_norm_var": 0.006360864639282227, "learning_rate": 0.0001, "loss": 1.6172, "loss/crossentropy": 2.712856411933899, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.23049365729093552, "step": 11197 }, { "epoch": 0.16721044654656225, "grad_norm": 0.365234375, "grad_norm_var": 0.006425841649373373, "learning_rate": 0.0001, "loss": 1.6884, "loss/crossentropy": 2.7559244632720947, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.2938459515571594, "step": 11198 }, { "epoch": 0.16722537871717721, "grad_norm": 0.7265625, "grad_norm_var": 0.01570611000061035, "learning_rate": 0.0001, "loss": 1.5954, "loss/crossentropy": 2.545587182044983, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.22817949950695038, "step": 11199 }, { "epoch": 0.1672403108877922, "grad_norm": 0.27734375, "grad_norm_var": 0.016015052795410156, "learning_rate": 0.0001, "loss": 1.3036, "loss/crossentropy": 2.806199073791504, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17074665427207947, "step": 11200 }, { "epoch": 0.1672552430584072, "grad_norm": 0.328125, "grad_norm_var": 0.016007216771443684, "learning_rate": 0.0001, "loss": 1.4452, "loss/crossentropy": 2.479894518852234, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.19129064679145813, "step": 11201 }, { "epoch": 0.16727017522902216, "grad_norm": 0.31640625, "grad_norm_var": 0.01582310994466146, "learning_rate": 0.0001, "loss": 1.4307, "loss/crossentropy": 2.6562808752059937, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20018354058265686, "step": 11202 }, { "epoch": 0.16728510739963715, "grad_norm": 0.27734375, "grad_norm_var": 0.01594079335530599, "learning_rate": 0.0001, "loss": 1.3663, "loss/crossentropy": 2.6102243661880493, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18663232028484344, "step": 11203 }, { "epoch": 0.16730003957025213, "grad_norm": 0.29296875, "grad_norm_var": 0.01588743527730306, "learning_rate": 0.0001, "loss": 1.579, "loss/crossentropy": 2.3904924392700195, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.24693164229393005, "step": 11204 }, { "epoch": 0.16731497174086712, "grad_norm": 0.294921875, "grad_norm_var": 0.016133483250935873, "learning_rate": 0.0001, "loss": 1.3589, "loss/crossentropy": 2.587332010269165, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1909010261297226, "step": 11205 }, { "epoch": 0.1673299039114821, "grad_norm": 0.2890625, "grad_norm_var": 0.01606138547261556, "learning_rate": 0.0001, "loss": 1.1978, "loss/crossentropy": 2.5044960975646973, "loss/fcd": 1.04296875, "loss/idx": 11.0, "loss/logits": 0.15483567118644714, "step": 11206 }, { "epoch": 0.16734483608209708, "grad_norm": 0.2890625, "grad_norm_var": 0.016216468811035157, "learning_rate": 0.0001, "loss": 1.343, "loss/crossentropy": 2.821107506752014, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.19850261509418488, "step": 11207 }, { "epoch": 0.16735976825271207, "grad_norm": 0.259765625, "grad_norm_var": 0.016504414876302085, "learning_rate": 0.0001, "loss": 1.3379, "loss/crossentropy": 2.4077141284942627, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.19336578249931335, "step": 11208 }, { "epoch": 0.16737470042332703, "grad_norm": 0.2734375, "grad_norm_var": 0.012435849507649739, "learning_rate": 0.0001, "loss": 1.4533, "loss/crossentropy": 2.6089816093444824, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2111133188009262, "step": 11209 }, { "epoch": 0.16738963259394202, "grad_norm": 0.357421875, "grad_norm_var": 0.012454732259114584, "learning_rate": 0.0001, "loss": 1.5419, "loss/crossentropy": 2.4765453338623047, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.21765514463186264, "step": 11210 }, { "epoch": 0.167404564764557, "grad_norm": 0.302734375, "grad_norm_var": 0.012462600072224935, "learning_rate": 0.0001, "loss": 1.4258, "loss/crossentropy": 2.5481951236724854, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20318147540092468, "step": 11211 }, { "epoch": 0.16741949693517197, "grad_norm": 0.306640625, "grad_norm_var": 0.012364689509073894, "learning_rate": 0.0001, "loss": 1.3953, "loss/crossentropy": 2.585931658744812, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19214963912963867, "step": 11212 }, { "epoch": 0.16743442910578696, "grad_norm": 0.322265625, "grad_norm_var": 0.012028233210245768, "learning_rate": 0.0001, "loss": 1.374, "loss/crossentropy": 2.461718797683716, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.17482328414916992, "step": 11213 }, { "epoch": 0.16744936127640195, "grad_norm": 0.42578125, "grad_norm_var": 0.012542152404785156, "learning_rate": 0.0001, "loss": 1.5928, "loss/crossentropy": 2.5121132135391235, "loss/fcd": 1.375, "loss/idx": 11.0, "loss/logits": 0.217751607298851, "step": 11214 }, { "epoch": 0.16746429344701694, "grad_norm": 0.294921875, "grad_norm_var": 0.0015790144602457683, "learning_rate": 0.0001, "loss": 1.4128, "loss/crossentropy": 2.4252820014953613, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20968757569789886, "step": 11215 }, { "epoch": 0.1674792256176319, "grad_norm": 0.359375, "grad_norm_var": 0.0016778151194254557, "learning_rate": 0.0001, "loss": 1.4585, "loss/crossentropy": 2.4646068811416626, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.1929214745759964, "step": 11216 }, { "epoch": 0.1674941577882469, "grad_norm": 0.345703125, "grad_norm_var": 0.0017351786295572917, "learning_rate": 0.0001, "loss": 1.8381, "loss/crossentropy": 2.1710288524627686, "loss/fcd": 1.54296875, "loss/idx": 11.0, "loss/logits": 0.29510247707366943, "step": 11217 }, { "epoch": 0.16750908995886188, "grad_norm": 0.40234375, "grad_norm_var": 0.0022359212239583333, "learning_rate": 0.0001, "loss": 1.6554, "loss/crossentropy": 2.290440082550049, "loss/fcd": 1.40625, "loss/idx": 11.0, "loss/logits": 0.24911073595285416, "step": 11218 }, { "epoch": 0.16752402212947684, "grad_norm": 0.32421875, "grad_norm_var": 0.002116902669270833, "learning_rate": 0.0001, "loss": 1.4345, "loss/crossentropy": 2.616995930671692, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.21573270857334137, "step": 11219 }, { "epoch": 0.16753895430009183, "grad_norm": 0.314453125, "grad_norm_var": 0.0020646254221598306, "learning_rate": 0.0001, "loss": 1.4709, "loss/crossentropy": 2.5795416831970215, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.20526837557554245, "step": 11220 }, { "epoch": 0.16755388647070682, "grad_norm": 0.26171875, "grad_norm_var": 0.0022562026977539064, "learning_rate": 0.0001, "loss": 1.3122, "loss/crossentropy": 2.738292098045349, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17158186435699463, "step": 11221 }, { "epoch": 0.1675688186413218, "grad_norm": 0.29296875, "grad_norm_var": 0.002240753173828125, "learning_rate": 0.0001, "loss": 1.5182, "loss/crossentropy": 2.558456063270569, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.23307158797979355, "step": 11222 }, { "epoch": 0.16758375081193677, "grad_norm": 0.33203125, "grad_norm_var": 0.0021743138631184896, "learning_rate": 0.0001, "loss": 1.5046, "loss/crossentropy": 2.7905044555664062, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21551436185836792, "step": 11223 }, { "epoch": 0.16759868298255176, "grad_norm": 0.6328125, "grad_norm_var": 0.00770262082417806, "learning_rate": 0.0001, "loss": 2.1062, "loss/crossentropy": 2.850176215171814, "loss/fcd": 1.57421875, "loss/idx": 11.0, "loss/logits": 0.5319568440318108, "step": 11224 }, { "epoch": 0.16761361515316675, "grad_norm": 0.294921875, "grad_norm_var": 0.007521311442057292, "learning_rate": 0.0001, "loss": 1.3216, "loss/crossentropy": 2.6550129652023315, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.18492301553487778, "step": 11225 }, { "epoch": 0.1676285473237817, "grad_norm": 0.3125, "grad_norm_var": 0.007591867446899414, "learning_rate": 0.0001, "loss": 1.4927, "loss/crossentropy": 2.5940921306610107, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.2270592749118805, "step": 11226 }, { "epoch": 0.1676434794943967, "grad_norm": 0.2890625, "grad_norm_var": 0.007681210835774739, "learning_rate": 0.0001, "loss": 1.4277, "loss/crossentropy": 2.54459810256958, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20897628366947174, "step": 11227 }, { "epoch": 0.1676584116650117, "grad_norm": 0.283203125, "grad_norm_var": 0.00783379872639974, "learning_rate": 0.0001, "loss": 1.3502, "loss/crossentropy": 2.637232542037964, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1822163388133049, "step": 11228 }, { "epoch": 0.16767334383562665, "grad_norm": 0.298828125, "grad_norm_var": 0.00793298085530599, "learning_rate": 0.0001, "loss": 1.4485, "loss/crossentropy": 2.4359079599380493, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19852884113788605, "step": 11229 }, { "epoch": 0.16768827600624164, "grad_norm": 0.359375, "grad_norm_var": 0.007462819417317708, "learning_rate": 0.0001, "loss": 1.6591, "loss/crossentropy": 2.4918688535690308, "loss/fcd": 1.41796875, "loss/idx": 11.0, "loss/logits": 0.2411687895655632, "step": 11230 }, { "epoch": 0.16770320817685663, "grad_norm": 0.322265625, "grad_norm_var": 0.00735467274983724, "learning_rate": 0.0001, "loss": 1.4481, "loss/crossentropy": 2.780614495277405, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20594516396522522, "step": 11231 }, { "epoch": 0.16771814034747162, "grad_norm": 0.291015625, "grad_norm_var": 0.007462040583292643, "learning_rate": 0.0001, "loss": 1.4324, "loss/crossentropy": 2.521647095680237, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20586850494146347, "step": 11232 }, { "epoch": 0.16773307251808658, "grad_norm": 0.326171875, "grad_norm_var": 0.007457590103149414, "learning_rate": 0.0001, "loss": 1.4715, "loss/crossentropy": 2.6751744747161865, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.21755805611610413, "step": 11233 }, { "epoch": 0.16774800468870157, "grad_norm": 0.30078125, "grad_norm_var": 0.00717161496480306, "learning_rate": 0.0001, "loss": 1.4134, "loss/crossentropy": 2.792916178703308, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20242342352867126, "step": 11234 }, { "epoch": 0.16776293685931656, "grad_norm": 0.32421875, "grad_norm_var": 0.00717161496480306, "learning_rate": 0.0001, "loss": 1.3555, "loss/crossentropy": 2.676019549369812, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.19140609353780746, "step": 11235 }, { "epoch": 0.16777786902993153, "grad_norm": 0.314453125, "grad_norm_var": 0.00717161496480306, "learning_rate": 0.0001, "loss": 1.4816, "loss/crossentropy": 2.571009039878845, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2199193760752678, "step": 11236 }, { "epoch": 0.16779280120054652, "grad_norm": 0.26953125, "grad_norm_var": 0.007107146581013997, "learning_rate": 0.0001, "loss": 1.435, "loss/crossentropy": 2.2703378200531006, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.22011487185955048, "step": 11237 }, { "epoch": 0.1678077333711615, "grad_norm": 0.30078125, "grad_norm_var": 0.007074721654256185, "learning_rate": 0.0001, "loss": 1.4186, "loss/crossentropy": 2.622628331184387, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1998894140124321, "step": 11238 }, { "epoch": 0.16782266554177647, "grad_norm": 0.263671875, "grad_norm_var": 0.0073322931925455725, "learning_rate": 0.0001, "loss": 1.3413, "loss/crossentropy": 2.395511507987976, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.17719589173793793, "step": 11239 }, { "epoch": 0.16783759771239146, "grad_norm": 0.28515625, "grad_norm_var": 0.0005704243977864583, "learning_rate": 0.0001, "loss": 1.3929, "loss/crossentropy": 2.619298219680786, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18194817006587982, "step": 11240 }, { "epoch": 0.16785252988300645, "grad_norm": 0.298828125, "grad_norm_var": 0.0005675633748372396, "learning_rate": 0.0001, "loss": 1.4596, "loss/crossentropy": 2.462513566017151, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.2018103152513504, "step": 11241 }, { "epoch": 0.16786746205362144, "grad_norm": 0.31640625, "grad_norm_var": 0.00057373046875, "learning_rate": 0.0001, "loss": 1.4324, "loss/crossentropy": 2.895916700363159, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2019701600074768, "step": 11242 }, { "epoch": 0.1678823942242364, "grad_norm": 0.3203125, "grad_norm_var": 0.0005777994791666667, "learning_rate": 0.0001, "loss": 1.6321, "loss/crossentropy": 2.402202010154724, "loss/fcd": 1.4140625, "loss/idx": 11.0, "loss/logits": 0.21806585788726807, "step": 11243 }, { "epoch": 0.1678973263948514, "grad_norm": 0.34375, "grad_norm_var": 0.0006334781646728516, "learning_rate": 0.0001, "loss": 1.3894, "loss/crossentropy": 2.7181899547576904, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18237115442752838, "step": 11244 }, { "epoch": 0.16791225856546638, "grad_norm": 0.29296875, "grad_norm_var": 0.000643157958984375, "learning_rate": 0.0001, "loss": 1.3598, "loss/crossentropy": 2.5447932481765747, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1801397204399109, "step": 11245 }, { "epoch": 0.16792719073608134, "grad_norm": 0.31640625, "grad_norm_var": 0.00046482086181640623, "learning_rate": 0.0001, "loss": 1.4901, "loss/crossentropy": 2.6763395071029663, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21670663356781006, "step": 11246 }, { "epoch": 0.16794212290669633, "grad_norm": 0.310546875, "grad_norm_var": 0.00044708251953125, "learning_rate": 0.0001, "loss": 1.4366, "loss/crossentropy": 2.6102986335754395, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.18656454235315323, "step": 11247 }, { "epoch": 0.16795705507731132, "grad_norm": 0.328125, "grad_norm_var": 0.00046550432840983074, "learning_rate": 0.0001, "loss": 1.4589, "loss/crossentropy": 2.6765289306640625, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.19714319705963135, "step": 11248 }, { "epoch": 0.1679719872479263, "grad_norm": 0.30859375, "grad_norm_var": 0.0004398981730143229, "learning_rate": 0.0001, "loss": 1.4561, "loss/crossentropy": 2.4902846813201904, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20222686976194382, "step": 11249 }, { "epoch": 0.16798691941854127, "grad_norm": 0.30078125, "grad_norm_var": 0.0004398981730143229, "learning_rate": 0.0001, "loss": 1.5762, "loss/crossentropy": 2.6842516660690308, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.24802783131599426, "step": 11250 }, { "epoch": 0.16800185158915626, "grad_norm": 0.263671875, "grad_norm_var": 0.0005211989084879557, "learning_rate": 0.0001, "loss": 1.4113, "loss/crossentropy": 2.6851121187210083, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20817875862121582, "step": 11251 }, { "epoch": 0.16801678375977125, "grad_norm": 0.296875, "grad_norm_var": 0.0005116144816080729, "learning_rate": 0.0001, "loss": 1.4261, "loss/crossentropy": 2.713153600692749, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19565599411725998, "step": 11252 }, { "epoch": 0.1680317159303862, "grad_norm": 0.283203125, "grad_norm_var": 0.0004658857981363932, "learning_rate": 0.0001, "loss": 1.4325, "loss/crossentropy": 2.618117928504944, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20979388058185577, "step": 11253 }, { "epoch": 0.1680466481010012, "grad_norm": 0.3359375, "grad_norm_var": 0.0005379835764567057, "learning_rate": 0.0001, "loss": 1.6107, "loss/crossentropy": 2.6625324487686157, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.25519372522830963, "step": 11254 }, { "epoch": 0.1680615802716162, "grad_norm": 0.3671875, "grad_norm_var": 0.0006500244140625, "learning_rate": 0.0001, "loss": 1.6719, "loss/crossentropy": 2.748431921005249, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.2851349711418152, "step": 11255 }, { "epoch": 0.16807651244223115, "grad_norm": 0.294921875, "grad_norm_var": 0.0006229241689046223, "learning_rate": 0.0001, "loss": 1.4581, "loss/crossentropy": 2.6540262699127197, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20424029231071472, "step": 11256 }, { "epoch": 0.16809144461284614, "grad_norm": 0.3359375, "grad_norm_var": 0.0006479899088541667, "learning_rate": 0.0001, "loss": 1.3591, "loss/crossentropy": 2.615196704864502, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.16379259526729584, "step": 11257 }, { "epoch": 0.16810637678346113, "grad_norm": 0.28515625, "grad_norm_var": 0.0006968180338541666, "learning_rate": 0.0001, "loss": 1.4323, "loss/crossentropy": 2.5170687437057495, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20572121441364288, "step": 11258 }, { "epoch": 0.16812130895407612, "grad_norm": 0.302734375, "grad_norm_var": 0.0006955305735270183, "learning_rate": 0.0001, "loss": 1.4342, "loss/crossentropy": 2.4974268674850464, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20370230078697205, "step": 11259 }, { "epoch": 0.16813624112469108, "grad_norm": 0.263671875, "grad_norm_var": 0.0007404963175455729, "learning_rate": 0.0001, "loss": 1.2861, "loss/crossentropy": 2.4674333333969116, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.16502176970243454, "step": 11260 }, { "epoch": 0.16815117329530607, "grad_norm": 0.314453125, "grad_norm_var": 0.0007336775461832683, "learning_rate": 0.0001, "loss": 1.4878, "loss/crossentropy": 2.4242178201675415, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.19868981838226318, "step": 11261 }, { "epoch": 0.16816610546592106, "grad_norm": 0.306640625, "grad_norm_var": 0.000727081298828125, "learning_rate": 0.0001, "loss": 1.2649, "loss/crossentropy": 2.7636961936950684, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.16337388008832932, "step": 11262 }, { "epoch": 0.16818103763653602, "grad_norm": 0.302734375, "grad_norm_var": 0.000726318359375, "learning_rate": 0.0001, "loss": 1.3795, "loss/crossentropy": 2.4855923652648926, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1919863298535347, "step": 11263 }, { "epoch": 0.168195969807151, "grad_norm": 0.291015625, "grad_norm_var": 0.0007012526194254558, "learning_rate": 0.0001, "loss": 1.3461, "loss/crossentropy": 2.473868489265442, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18589740991592407, "step": 11264 }, { "epoch": 0.168210901977766, "grad_norm": 0.294921875, "grad_norm_var": 0.0007033665974934896, "learning_rate": 0.0001, "loss": 1.3755, "loss/crossentropy": 2.6413915157318115, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18413282930850983, "step": 11265 }, { "epoch": 0.168225834148381, "grad_norm": 0.29296875, "grad_norm_var": 0.0007089614868164062, "learning_rate": 0.0001, "loss": 1.3931, "loss/crossentropy": 2.650281071662903, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19778770208358765, "step": 11266 }, { "epoch": 0.16824076631899595, "grad_norm": 0.283203125, "grad_norm_var": 0.0006329854329427083, "learning_rate": 0.0001, "loss": 1.3872, "loss/crossentropy": 2.629088521003723, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19968149065971375, "step": 11267 }, { "epoch": 0.16825569848961094, "grad_norm": 0.2734375, "grad_norm_var": 0.0006871541341145833, "learning_rate": 0.0001, "loss": 1.3765, "loss/crossentropy": 2.649331569671631, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.20467153191566467, "step": 11268 }, { "epoch": 0.16827063066022593, "grad_norm": 0.31640625, "grad_norm_var": 0.0006739139556884765, "learning_rate": 0.0001, "loss": 1.4609, "loss/crossentropy": 2.755821704864502, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21094805747270584, "step": 11269 }, { "epoch": 0.1682855628308409, "grad_norm": 0.283203125, "grad_norm_var": 0.0006219863891601563, "learning_rate": 0.0001, "loss": 1.3942, "loss/crossentropy": 2.464492678642273, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19498394429683685, "step": 11270 }, { "epoch": 0.16830049500145589, "grad_norm": 0.32421875, "grad_norm_var": 0.00035552978515625, "learning_rate": 0.0001, "loss": 1.3831, "loss/crossentropy": 2.986533761024475, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1838766634464264, "step": 11271 }, { "epoch": 0.16831542717207088, "grad_norm": 0.279296875, "grad_norm_var": 0.00037689208984375, "learning_rate": 0.0001, "loss": 1.398, "loss/crossentropy": 2.476368546485901, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19879595190286636, "step": 11272 }, { "epoch": 0.16833035934268584, "grad_norm": 0.328125, "grad_norm_var": 0.00034001668294270836, "learning_rate": 0.0001, "loss": 1.3509, "loss/crossentropy": 2.691397786140442, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1829264909029007, "step": 11273 }, { "epoch": 0.16834529151330083, "grad_norm": 0.318359375, "grad_norm_var": 0.00035920143127441404, "learning_rate": 0.0001, "loss": 1.4153, "loss/crossentropy": 2.580561876296997, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.2004140540957451, "step": 11274 }, { "epoch": 0.16836022368391582, "grad_norm": 0.298828125, "grad_norm_var": 0.0003579298655192057, "learning_rate": 0.0001, "loss": 1.247, "loss/crossentropy": 2.6363474130630493, "loss/fcd": 1.09375, "loss/idx": 11.0, "loss/logits": 0.15320981293916702, "step": 11275 }, { "epoch": 0.1683751558545308, "grad_norm": 0.36328125, "grad_norm_var": 0.0005192438761393229, "learning_rate": 0.0001, "loss": 1.5377, "loss/crossentropy": 2.4575968980789185, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.23303701728582382, "step": 11276 }, { "epoch": 0.16839008802514577, "grad_norm": 0.470703125, "grad_norm_var": 0.0022536595662434895, "learning_rate": 0.0001, "loss": 1.4988, "loss/crossentropy": 2.2449299097061157, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.1902027204632759, "step": 11277 }, { "epoch": 0.16840502019576076, "grad_norm": 0.287109375, "grad_norm_var": 0.002297210693359375, "learning_rate": 0.0001, "loss": 1.4497, "loss/crossentropy": 2.6440993547439575, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21528510004281998, "step": 11278 }, { "epoch": 0.16841995236637575, "grad_norm": 0.296875, "grad_norm_var": 0.0023073673248291014, "learning_rate": 0.0001, "loss": 1.3132, "loss/crossentropy": 2.648463726043701, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.18036441504955292, "step": 11279 }, { "epoch": 0.1684348845369907, "grad_norm": 0.341796875, "grad_norm_var": 0.0023222446441650392, "learning_rate": 0.0001, "loss": 1.479, "loss/crossentropy": 2.703896164894104, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.22121450304985046, "step": 11280 }, { "epoch": 0.1684498167076057, "grad_norm": 0.427734375, "grad_norm_var": 0.0030550479888916014, "learning_rate": 0.0001, "loss": 1.6116, "loss/crossentropy": 2.548880100250244, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.23265084624290466, "step": 11281 }, { "epoch": 0.1684647488782207, "grad_norm": 0.30078125, "grad_norm_var": 0.003026437759399414, "learning_rate": 0.0001, "loss": 1.3309, "loss/crossentropy": 2.635399103164673, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18633365631103516, "step": 11282 }, { "epoch": 0.16847968104883568, "grad_norm": 0.365234375, "grad_norm_var": 0.002994394302368164, "learning_rate": 0.0001, "loss": 1.5474, "loss/crossentropy": 2.518557071685791, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.21923603117465973, "step": 11283 }, { "epoch": 0.16849461321945064, "grad_norm": 0.74609375, "grad_norm_var": 0.013410679499308268, "learning_rate": 0.0001, "loss": 1.6267, "loss/crossentropy": 2.2590692043304443, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.22822271287441254, "step": 11284 }, { "epoch": 0.16850954539006563, "grad_norm": 0.279296875, "grad_norm_var": 0.013708750406901041, "learning_rate": 0.0001, "loss": 1.2496, "loss/crossentropy": 2.4620155096054077, "loss/fcd": 1.09375, "loss/idx": 11.0, "loss/logits": 0.15580351650714874, "step": 11285 }, { "epoch": 0.16852447756068062, "grad_norm": 0.306640625, "grad_norm_var": 0.013512674967447917, "learning_rate": 0.0001, "loss": 1.4095, "loss/crossentropy": 2.470197319984436, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20243582874536514, "step": 11286 }, { "epoch": 0.16853940973129558, "grad_norm": 0.3046875, "grad_norm_var": 0.013625526428222656, "learning_rate": 0.0001, "loss": 1.3206, "loss/crossentropy": 2.673904061317444, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17611442506313324, "step": 11287 }, { "epoch": 0.16855434190191057, "grad_norm": 0.34375, "grad_norm_var": 0.013215875625610352, "learning_rate": 0.0001, "loss": 1.5155, "loss/crossentropy": 2.6594674587249756, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.19912546128034592, "step": 11288 }, { "epoch": 0.16856927407252556, "grad_norm": 0.291015625, "grad_norm_var": 0.013465627034505209, "learning_rate": 0.0001, "loss": 1.4285, "loss/crossentropy": 2.6979016065597534, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20580358803272247, "step": 11289 }, { "epoch": 0.16858420624314052, "grad_norm": 0.298828125, "grad_norm_var": 0.013595008850097656, "learning_rate": 0.0001, "loss": 1.2918, "loss/crossentropy": 2.5784285068511963, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.16293465346097946, "step": 11290 }, { "epoch": 0.1685991384137555, "grad_norm": 0.291015625, "grad_norm_var": 0.013660113016764322, "learning_rate": 0.0001, "loss": 1.4998, "loss/crossentropy": 2.4930449724197388, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.21462299674749374, "step": 11291 }, { "epoch": 0.1686140705843705, "grad_norm": 0.2890625, "grad_norm_var": 0.013943990071614584, "learning_rate": 0.0001, "loss": 1.4345, "loss/crossentropy": 2.68938672542572, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.21180693060159683, "step": 11292 }, { "epoch": 0.1686290027549855, "grad_norm": 0.2890625, "grad_norm_var": 0.013144286473592122, "learning_rate": 0.0001, "loss": 1.3146, "loss/crossentropy": 2.5374648571014404, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1817513182759285, "step": 11293 }, { "epoch": 0.16864393492560045, "grad_norm": 0.263671875, "grad_norm_var": 0.013347609837849935, "learning_rate": 0.0001, "loss": 1.4554, "loss/crossentropy": 2.4192153215408325, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20541688799858093, "step": 11294 }, { "epoch": 0.16865886709621544, "grad_norm": 0.296875, "grad_norm_var": 0.013347609837849935, "learning_rate": 0.0001, "loss": 1.3416, "loss/crossentropy": 2.615100622177124, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1775275617837906, "step": 11295 }, { "epoch": 0.16867379926683043, "grad_norm": 0.326171875, "grad_norm_var": 0.013358545303344727, "learning_rate": 0.0001, "loss": 1.4656, "loss/crossentropy": 2.646006464958191, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.19216316938400269, "step": 11296 }, { "epoch": 0.1686887314374454, "grad_norm": 0.306640625, "grad_norm_var": 0.012838220596313477, "learning_rate": 0.0001, "loss": 1.5293, "loss/crossentropy": 2.5872678756713867, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.23247060179710388, "step": 11297 }, { "epoch": 0.16870366360806038, "grad_norm": 0.341796875, "grad_norm_var": 0.012777137756347656, "learning_rate": 0.0001, "loss": 1.6012, "loss/crossentropy": 2.6784595251083374, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.2456812784075737, "step": 11298 }, { "epoch": 0.16871859577867537, "grad_norm": 0.28515625, "grad_norm_var": 0.01284165382385254, "learning_rate": 0.0001, "loss": 1.3908, "loss/crossentropy": 2.6570308208465576, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19548102468252182, "step": 11299 }, { "epoch": 0.16873352794929034, "grad_norm": 0.326171875, "grad_norm_var": 0.0004948298136393229, "learning_rate": 0.0001, "loss": 1.4608, "loss/crossentropy": 2.3353248834609985, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.191295325756073, "step": 11300 }, { "epoch": 0.16874846011990532, "grad_norm": 0.2890625, "grad_norm_var": 0.00047059059143066405, "learning_rate": 0.0001, "loss": 1.3845, "loss/crossentropy": 2.671210289001465, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19702325761318207, "step": 11301 }, { "epoch": 0.16876339229052031, "grad_norm": 0.283203125, "grad_norm_var": 0.0004938602447509766, "learning_rate": 0.0001, "loss": 1.3774, "loss/crossentropy": 2.6648666858673096, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1860307678580284, "step": 11302 }, { "epoch": 0.1687783244611353, "grad_norm": 0.306640625, "grad_norm_var": 0.0004948933919270833, "learning_rate": 0.0001, "loss": 1.4369, "loss/crossentropy": 2.659795045852661, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.2142351269721985, "step": 11303 }, { "epoch": 0.16879325663175027, "grad_norm": 0.333984375, "grad_norm_var": 0.0004461765289306641, "learning_rate": 0.0001, "loss": 1.3959, "loss/crossentropy": 2.708907961845398, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19672434777021408, "step": 11304 }, { "epoch": 0.16880818880236526, "grad_norm": 0.333984375, "grad_norm_var": 0.0005035241444905599, "learning_rate": 0.0001, "loss": 1.5411, "loss/crossentropy": 2.7869057655334473, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2285740226507187, "step": 11305 }, { "epoch": 0.16882312097298025, "grad_norm": 0.3359375, "grad_norm_var": 0.0005648295084635417, "learning_rate": 0.0001, "loss": 1.6355, "loss/crossentropy": 2.50240421295166, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.23702390491962433, "step": 11306 }, { "epoch": 0.1688380531435952, "grad_norm": 0.30859375, "grad_norm_var": 0.0005486647288004558, "learning_rate": 0.0001, "loss": 1.49, "loss/crossentropy": 2.5675615072250366, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.2165631800889969, "step": 11307 }, { "epoch": 0.1688529853142102, "grad_norm": 0.294921875, "grad_norm_var": 0.0005366007486979166, "learning_rate": 0.0001, "loss": 1.4031, "loss/crossentropy": 2.7977579832077026, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.20775257050991058, "step": 11308 }, { "epoch": 0.1688679174848252, "grad_norm": 0.306640625, "grad_norm_var": 0.0005124251047770183, "learning_rate": 0.0001, "loss": 1.4542, "loss/crossentropy": 2.563469648361206, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2042045071721077, "step": 11309 }, { "epoch": 0.16888284965544018, "grad_norm": 0.390625, "grad_norm_var": 0.000757280985514323, "learning_rate": 0.0001, "loss": 1.5541, "loss/crossentropy": 2.1224316358566284, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.18299932777881622, "step": 11310 }, { "epoch": 0.16889778182605514, "grad_norm": 0.302734375, "grad_norm_var": 0.0007439772288004557, "learning_rate": 0.0001, "loss": 1.3884, "loss/crossentropy": 2.5418368577957153, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1969779133796692, "step": 11311 }, { "epoch": 0.16891271399667013, "grad_norm": 0.3125, "grad_norm_var": 0.0007389704386393229, "learning_rate": 0.0001, "loss": 1.3155, "loss/crossentropy": 2.5921212434768677, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17100698500871658, "step": 11312 }, { "epoch": 0.16892764616728512, "grad_norm": 0.314453125, "grad_norm_var": 0.0007328669230143229, "learning_rate": 0.0001, "loss": 1.4916, "loss/crossentropy": 2.590090036392212, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.2142152488231659, "step": 11313 }, { "epoch": 0.16894257833790008, "grad_norm": 0.322265625, "grad_norm_var": 0.00069122314453125, "learning_rate": 0.0001, "loss": 1.4105, "loss/crossentropy": 2.6624733209609985, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.17610663175582886, "step": 11314 }, { "epoch": 0.16895751050851507, "grad_norm": 0.333984375, "grad_norm_var": 0.0006431420644124349, "learning_rate": 0.0001, "loss": 1.4386, "loss/crossentropy": 2.6279128789901733, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.21205507218837738, "step": 11315 }, { "epoch": 0.16897244267913006, "grad_norm": 0.3359375, "grad_norm_var": 0.0006591161092122396, "learning_rate": 0.0001, "loss": 1.4233, "loss/crossentropy": 2.7323710918426514, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.21240371465682983, "step": 11316 }, { "epoch": 0.16898737484974502, "grad_norm": 0.318359375, "grad_norm_var": 0.0005954583485921224, "learning_rate": 0.0001, "loss": 1.5056, "loss/crossentropy": 2.3376147747039795, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.22046002745628357, "step": 11317 }, { "epoch": 0.16900230702036, "grad_norm": 0.70703125, "grad_norm_var": 0.009690793355305989, "learning_rate": 0.0001, "loss": 2.003, "loss/crossentropy": 2.4609912633895874, "loss/fcd": 1.6640625, "loss/idx": 11.0, "loss/logits": 0.33889109641313553, "step": 11318 }, { "epoch": 0.169017239190975, "grad_norm": 0.29296875, "grad_norm_var": 0.009776798884073894, "learning_rate": 0.0001, "loss": 1.5542, "loss/crossentropy": 2.5555144548416138, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2416730746626854, "step": 11319 }, { "epoch": 0.16903217136159, "grad_norm": 0.38671875, "grad_norm_var": 0.009862200419108073, "learning_rate": 0.0001, "loss": 1.6152, "loss/crossentropy": 2.5427955389022827, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.24802611023187637, "step": 11320 }, { "epoch": 0.16904710353220495, "grad_norm": 0.359375, "grad_norm_var": 0.009848769505818684, "learning_rate": 0.0001, "loss": 1.5071, "loss/crossentropy": 2.5753889083862305, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.22197184711694717, "step": 11321 }, { "epoch": 0.16906203570281994, "grad_norm": 0.28125, "grad_norm_var": 0.010148731867472331, "learning_rate": 0.0001, "loss": 1.4634, "loss/crossentropy": 2.488073229789734, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20171085745096207, "step": 11322 }, { "epoch": 0.16907696787343493, "grad_norm": 0.3125, "grad_norm_var": 0.010129149754842122, "learning_rate": 0.0001, "loss": 1.3986, "loss/crossentropy": 2.959079384803772, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.17987271398305893, "step": 11323 }, { "epoch": 0.1690919000440499, "grad_norm": 0.31640625, "grad_norm_var": 0.01000518798828125, "learning_rate": 0.0001, "loss": 1.4786, "loss/crossentropy": 2.4267122745513916, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20907729864120483, "step": 11324 }, { "epoch": 0.16910683221466488, "grad_norm": 0.353515625, "grad_norm_var": 0.00987396240234375, "learning_rate": 0.0001, "loss": 1.5008, "loss/crossentropy": 2.4760560989379883, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.19998915493488312, "step": 11325 }, { "epoch": 0.16912176438527987, "grad_norm": 0.3671875, "grad_norm_var": 0.009789276123046874, "learning_rate": 0.0001, "loss": 1.4819, "loss/crossentropy": 2.71607506275177, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21629994362592697, "step": 11326 }, { "epoch": 0.16913669655589486, "grad_norm": 0.349609375, "grad_norm_var": 0.009624481201171875, "learning_rate": 0.0001, "loss": 1.4994, "loss/crossentropy": 2.556922197341919, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.21817222237586975, "step": 11327 }, { "epoch": 0.16915162872650982, "grad_norm": 0.369140625, "grad_norm_var": 0.009511550267537435, "learning_rate": 0.0001, "loss": 1.6454, "loss/crossentropy": 2.4588871002197266, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.25869986414909363, "step": 11328 }, { "epoch": 0.1691665608971248, "grad_norm": 0.306640625, "grad_norm_var": 0.009560251235961914, "learning_rate": 0.0001, "loss": 1.4337, "loss/crossentropy": 2.4441046714782715, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.21103136241436005, "step": 11329 }, { "epoch": 0.1691814930677398, "grad_norm": 0.416015625, "grad_norm_var": 0.009674692153930664, "learning_rate": 0.0001, "loss": 1.3155, "loss/crossentropy": 2.6921600103378296, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1592121422290802, "step": 11330 }, { "epoch": 0.16919642523835476, "grad_norm": 0.318359375, "grad_norm_var": 0.00975022315979004, "learning_rate": 0.0001, "loss": 1.4975, "loss/crossentropy": 2.586800694465637, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.20839284360408783, "step": 11331 }, { "epoch": 0.16921135740896975, "grad_norm": 0.353515625, "grad_norm_var": 0.009708595275878907, "learning_rate": 0.0001, "loss": 1.5355, "loss/crossentropy": 2.3281986713409424, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.2034563645720482, "step": 11332 }, { "epoch": 0.16922628957958474, "grad_norm": 0.310546875, "grad_norm_var": 0.009758949279785156, "learning_rate": 0.0001, "loss": 1.4339, "loss/crossentropy": 2.7140945196151733, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20340687781572342, "step": 11333 }, { "epoch": 0.1692412217501997, "grad_norm": 0.302734375, "grad_norm_var": 0.001405191421508789, "learning_rate": 0.0001, "loss": 1.432, "loss/crossentropy": 2.63227117061615, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2054319903254509, "step": 11334 }, { "epoch": 0.1692561539208147, "grad_norm": 0.2890625, "grad_norm_var": 0.0014292240142822266, "learning_rate": 0.0001, "loss": 1.3089, "loss/crossentropy": 2.772392988204956, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1760861799120903, "step": 11335 }, { "epoch": 0.16927108609142968, "grad_norm": 0.57421875, "grad_norm_var": 0.004868555068969727, "learning_rate": 0.0001, "loss": 1.9423, "loss/crossentropy": 2.478410005569458, "loss/fcd": 1.62890625, "loss/idx": 11.0, "loss/logits": 0.3133868873119354, "step": 11336 }, { "epoch": 0.16928601826204467, "grad_norm": 0.294921875, "grad_norm_var": 0.00503692626953125, "learning_rate": 0.0001, "loss": 1.3562, "loss/crossentropy": 2.5723793506622314, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1882561892271042, "step": 11337 }, { "epoch": 0.16930095043265964, "grad_norm": 0.330078125, "grad_norm_var": 0.004772679011027018, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.5066866874694824, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19555717706680298, "step": 11338 }, { "epoch": 0.16931588260327463, "grad_norm": 0.28125, "grad_norm_var": 0.004980707168579101, "learning_rate": 0.0001, "loss": 1.3328, "loss/crossentropy": 2.7414215803146362, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.17657680809497833, "step": 11339 }, { "epoch": 0.16933081477388962, "grad_norm": 0.3046875, "grad_norm_var": 0.005035257339477539, "learning_rate": 0.0001, "loss": 1.4239, "loss/crossentropy": 2.5824025869369507, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20514799654483795, "step": 11340 }, { "epoch": 0.16934574694450458, "grad_norm": 0.2734375, "grad_norm_var": 0.005346107482910156, "learning_rate": 0.0001, "loss": 1.2901, "loss/crossentropy": 2.5904074907302856, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.1729433760046959, "step": 11341 }, { "epoch": 0.16936067911511957, "grad_norm": 0.27734375, "grad_norm_var": 0.005525970458984375, "learning_rate": 0.0001, "loss": 1.468, "loss/crossentropy": 2.5866568088531494, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.22194980084896088, "step": 11342 }, { "epoch": 0.16937561128573456, "grad_norm": 0.310546875, "grad_norm_var": 0.0055425008138020836, "learning_rate": 0.0001, "loss": 1.3495, "loss/crossentropy": 2.6055383682250977, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17757821828126907, "step": 11343 }, { "epoch": 0.16939054345634955, "grad_norm": 0.4453125, "grad_norm_var": 0.00628202756245931, "learning_rate": 0.0001, "loss": 1.7517, "loss/crossentropy": 2.3552427291870117, "loss/fcd": 1.46875, "loss/idx": 11.0, "loss/logits": 0.2829367071390152, "step": 11344 }, { "epoch": 0.1694054756269645, "grad_norm": 0.33984375, "grad_norm_var": 0.006217447916666666, "learning_rate": 0.0001, "loss": 1.3489, "loss/crossentropy": 2.7097803354263306, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18485558032989502, "step": 11345 }, { "epoch": 0.1694204077975795, "grad_norm": 0.328125, "grad_norm_var": 0.005796162287394205, "learning_rate": 0.0001, "loss": 1.5591, "loss/crossentropy": 2.5311182737350464, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.2270602211356163, "step": 11346 }, { "epoch": 0.1694353399681945, "grad_norm": 0.3203125, "grad_norm_var": 0.005792490641276042, "learning_rate": 0.0001, "loss": 1.6021, "loss/crossentropy": 2.2632205486297607, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.24666912853717804, "step": 11347 }, { "epoch": 0.16945027213880945, "grad_norm": 0.283203125, "grad_norm_var": 0.005913798014322917, "learning_rate": 0.0001, "loss": 1.2976, "loss/crossentropy": 2.602471709251404, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.17649231106042862, "step": 11348 }, { "epoch": 0.16946520430942444, "grad_norm": 0.2890625, "grad_norm_var": 0.005995798110961914, "learning_rate": 0.0001, "loss": 1.4745, "loss/crossentropy": 2.4286437034606934, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20103368908166885, "step": 11349 }, { "epoch": 0.16948013648003943, "grad_norm": 0.404296875, "grad_norm_var": 0.006301609675089518, "learning_rate": 0.0001, "loss": 1.8356, "loss/crossentropy": 2.379179835319519, "loss/fcd": 1.5546875, "loss/idx": 11.0, "loss/logits": 0.280933253467083, "step": 11350 }, { "epoch": 0.1694950686506544, "grad_norm": 0.294921875, "grad_norm_var": 0.006268564860026042, "learning_rate": 0.0001, "loss": 1.4113, "loss/crossentropy": 2.728244662284851, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20032083243131638, "step": 11351 }, { "epoch": 0.16951000082126938, "grad_norm": 0.302734375, "grad_norm_var": 0.0021967411041259764, "learning_rate": 0.0001, "loss": 1.3759, "loss/crossentropy": 2.5583555698394775, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.2079111486673355, "step": 11352 }, { "epoch": 0.16952493299188437, "grad_norm": 0.427734375, "grad_norm_var": 0.002899281183878581, "learning_rate": 0.0001, "loss": 1.6695, "loss/crossentropy": 2.561730980873108, "loss/fcd": 1.40234375, "loss/idx": 11.0, "loss/logits": 0.2671428620815277, "step": 11353 }, { "epoch": 0.16953986516249936, "grad_norm": 0.2890625, "grad_norm_var": 0.0029810587565104168, "learning_rate": 0.0001, "loss": 1.3756, "loss/crossentropy": 2.3758312463760376, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18419302999973297, "step": 11354 }, { "epoch": 0.16955479733311432, "grad_norm": 0.28125, "grad_norm_var": 0.0029810587565104168, "learning_rate": 0.0001, "loss": 1.3295, "loss/crossentropy": 2.570099115371704, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.16929586976766586, "step": 11355 }, { "epoch": 0.1695697295037293, "grad_norm": 0.28125, "grad_norm_var": 0.0030733744303385415, "learning_rate": 0.0001, "loss": 1.3403, "loss/crossentropy": 2.6070226430892944, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1800984889268875, "step": 11356 }, { "epoch": 0.1695846616743443, "grad_norm": 0.376953125, "grad_norm_var": 0.003075901667277018, "learning_rate": 0.0001, "loss": 1.6625, "loss/crossentropy": 2.4603350162506104, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.2757546156644821, "step": 11357 }, { "epoch": 0.16959959384495926, "grad_norm": 0.31640625, "grad_norm_var": 0.0029061476389567057, "learning_rate": 0.0001, "loss": 1.4532, "loss/crossentropy": 2.4716612100601196, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.19931408762931824, "step": 11358 }, { "epoch": 0.16961452601557425, "grad_norm": 0.365234375, "grad_norm_var": 0.002946201960245768, "learning_rate": 0.0001, "loss": 1.4377, "loss/crossentropy": 2.643397569656372, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19946406781673431, "step": 11359 }, { "epoch": 0.16962945818618924, "grad_norm": 0.33203125, "grad_norm_var": 0.0020685672760009767, "learning_rate": 0.0001, "loss": 1.4235, "loss/crossentropy": 2.682942509651184, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.2086215242743492, "step": 11360 }, { "epoch": 0.1696443903568042, "grad_norm": 0.357421875, "grad_norm_var": 0.002117919921875, "learning_rate": 0.0001, "loss": 1.4139, "loss/crossentropy": 2.75070858001709, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19909942895174026, "step": 11361 }, { "epoch": 0.1696593225274192, "grad_norm": 0.3046875, "grad_norm_var": 0.002152252197265625, "learning_rate": 0.0001, "loss": 1.3666, "loss/crossentropy": 2.4981669187545776, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1751486137509346, "step": 11362 }, { "epoch": 0.16967425469803418, "grad_norm": 0.287109375, "grad_norm_var": 0.002249256769816081, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.386420488357544, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17699872702360153, "step": 11363 }, { "epoch": 0.16968918686864917, "grad_norm": 0.275390625, "grad_norm_var": 0.0022961775461832683, "learning_rate": 0.0001, "loss": 1.2582, "loss/crossentropy": 2.576873779296875, "loss/fcd": 1.08984375, "loss/idx": 11.0, "loss/logits": 0.16834937036037445, "step": 11364 }, { "epoch": 0.16970411903926413, "grad_norm": 0.294921875, "grad_norm_var": 0.002270952860514323, "learning_rate": 0.0001, "loss": 1.4409, "loss/crossentropy": 2.6907237768173218, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21042509377002716, "step": 11365 }, { "epoch": 0.16971905120987912, "grad_norm": 0.3671875, "grad_norm_var": 0.001962010065714518, "learning_rate": 0.0001, "loss": 1.5697, "loss/crossentropy": 2.3580970764160156, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.23380118608474731, "step": 11366 }, { "epoch": 0.1697339833804941, "grad_norm": 0.435546875, "grad_norm_var": 0.0026875654856363933, "learning_rate": 0.0001, "loss": 1.6518, "loss/crossentropy": 2.9293335676193237, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.2572972774505615, "step": 11367 }, { "epoch": 0.16974891555110908, "grad_norm": 0.326171875, "grad_norm_var": 0.0026337782541910808, "learning_rate": 0.0001, "loss": 1.4374, "loss/crossentropy": 2.552623987197876, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.21474380791187286, "step": 11368 }, { "epoch": 0.16976384772172406, "grad_norm": 0.314453125, "grad_norm_var": 0.00199583371480306, "learning_rate": 0.0001, "loss": 1.4215, "loss/crossentropy": 2.6994141340255737, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20662156492471695, "step": 11369 }, { "epoch": 0.16977877989233905, "grad_norm": 0.30859375, "grad_norm_var": 0.0019252618153889974, "learning_rate": 0.0001, "loss": 1.3668, "loss/crossentropy": 2.553329348564148, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1831628605723381, "step": 11370 }, { "epoch": 0.16979371206295404, "grad_norm": 0.296875, "grad_norm_var": 0.001846170425415039, "learning_rate": 0.0001, "loss": 1.384, "loss/crossentropy": 2.5444343090057373, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19652382284402847, "step": 11371 }, { "epoch": 0.169808644233569, "grad_norm": 0.337890625, "grad_norm_var": 0.0016972859700520833, "learning_rate": 0.0001, "loss": 1.5989, "loss/crossentropy": 2.3522634506225586, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.24735428392887115, "step": 11372 }, { "epoch": 0.169823576404184, "grad_norm": 0.294921875, "grad_norm_var": 0.0016158421834309895, "learning_rate": 0.0001, "loss": 1.379, "loss/crossentropy": 2.5937485694885254, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18758701533079147, "step": 11373 }, { "epoch": 0.16983850857479899, "grad_norm": 0.310546875, "grad_norm_var": 0.0016254266103108725, "learning_rate": 0.0001, "loss": 1.3861, "loss/crossentropy": 2.635282278060913, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19470267742872238, "step": 11374 }, { "epoch": 0.16985344074541395, "grad_norm": 0.296875, "grad_norm_var": 0.0015558878580729167, "learning_rate": 0.0001, "loss": 1.3584, "loss/crossentropy": 2.611207604408264, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19039595127105713, "step": 11375 }, { "epoch": 0.16986837291602894, "grad_norm": 0.28515625, "grad_norm_var": 0.0016260782877604166, "learning_rate": 0.0001, "loss": 1.3894, "loss/crossentropy": 2.6070451736450195, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19016969203948975, "step": 11376 }, { "epoch": 0.16988330508664393, "grad_norm": 0.45703125, "grad_norm_var": 0.0027650038401285807, "learning_rate": 0.0001, "loss": 1.5878, "loss/crossentropy": 2.5334194898605347, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.2401605322957039, "step": 11377 }, { "epoch": 0.1698982372572589, "grad_norm": 0.296875, "grad_norm_var": 0.0027895450592041017, "learning_rate": 0.0001, "loss": 1.3059, "loss/crossentropy": 2.513101100921631, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.16922490298748016, "step": 11378 }, { "epoch": 0.16991316942787388, "grad_norm": 0.353515625, "grad_norm_var": 0.0027376651763916016, "learning_rate": 0.0001, "loss": 1.4455, "loss/crossentropy": 2.655567169189453, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20332707464694977, "step": 11379 }, { "epoch": 0.16992810159848887, "grad_norm": 0.349609375, "grad_norm_var": 0.00255888303120931, "learning_rate": 0.0001, "loss": 1.2982, "loss/crossentropy": 2.4253649711608887, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1537143960595131, "step": 11380 }, { "epoch": 0.16994303376910386, "grad_norm": 0.44140625, "grad_norm_var": 0.0031585057576497396, "learning_rate": 0.0001, "loss": 1.8431, "loss/crossentropy": 2.4842449426651, "loss/fcd": 1.53515625, "loss/idx": 11.0, "loss/logits": 0.3079473525285721, "step": 11381 }, { "epoch": 0.16995796593971882, "grad_norm": 0.314453125, "grad_norm_var": 0.00315550168355306, "learning_rate": 0.0001, "loss": 1.4643, "loss/crossentropy": 2.482966661453247, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.19871283322572708, "step": 11382 }, { "epoch": 0.1699728981103338, "grad_norm": 0.2734375, "grad_norm_var": 0.0027056376139322916, "learning_rate": 0.0001, "loss": 1.4217, "loss/crossentropy": 2.5196545124053955, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20291777700185776, "step": 11383 }, { "epoch": 0.1699878302809488, "grad_norm": 0.375, "grad_norm_var": 0.002838754653930664, "learning_rate": 0.0001, "loss": 1.4144, "loss/crossentropy": 2.5764533281326294, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1878291368484497, "step": 11384 }, { "epoch": 0.17000276245156376, "grad_norm": 0.310546875, "grad_norm_var": 0.0028486728668212892, "learning_rate": 0.0001, "loss": 1.5496, "loss/crossentropy": 2.7000784873962402, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2604993134737015, "step": 11385 }, { "epoch": 0.17001769462217875, "grad_norm": 0.296875, "grad_norm_var": 0.002892923355102539, "learning_rate": 0.0001, "loss": 1.4998, "loss/crossentropy": 2.597913980484009, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.21465905010700226, "step": 11386 }, { "epoch": 0.17003262679279374, "grad_norm": 0.296875, "grad_norm_var": 0.002892923355102539, "learning_rate": 0.0001, "loss": 1.3835, "loss/crossentropy": 2.399269461631775, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.16868546605110168, "step": 11387 }, { "epoch": 0.17004755896340873, "grad_norm": 0.33984375, "grad_norm_var": 0.0028950373331705728, "learning_rate": 0.0001, "loss": 1.4649, "loss/crossentropy": 2.530832529067993, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21490097045898438, "step": 11388 }, { "epoch": 0.1700624911340237, "grad_norm": 0.337890625, "grad_norm_var": 0.0028048197428385416, "learning_rate": 0.0001, "loss": 1.3881, "loss/crossentropy": 2.646738648414612, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18500656634569168, "step": 11389 }, { "epoch": 0.17007742330463868, "grad_norm": 0.33203125, "grad_norm_var": 0.0027679284413655598, "learning_rate": 0.0001, "loss": 1.5389, "loss/crossentropy": 2.344831109046936, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.22644054889678955, "step": 11390 }, { "epoch": 0.17009235547525367, "grad_norm": 0.296875, "grad_norm_var": 0.0027679284413655598, "learning_rate": 0.0001, "loss": 1.4734, "loss/crossentropy": 2.3473631143569946, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21168296039104462, "step": 11391 }, { "epoch": 0.17010728764586863, "grad_norm": 0.328125, "grad_norm_var": 0.0025986830393473306, "learning_rate": 0.0001, "loss": 1.4037, "loss/crossentropy": 2.6915276050567627, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.2122957557439804, "step": 11392 }, { "epoch": 0.17012221981648362, "grad_norm": 0.28515625, "grad_norm_var": 0.0017062981923421225, "learning_rate": 0.0001, "loss": 1.4141, "loss/crossentropy": 2.654489517211914, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20314088463783264, "step": 11393 }, { "epoch": 0.1701371519870986, "grad_norm": 0.33984375, "grad_norm_var": 0.0016503492991129557, "learning_rate": 0.0001, "loss": 1.4275, "loss/crossentropy": 2.710642099380493, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20872271806001663, "step": 11394 }, { "epoch": 0.17015208415771357, "grad_norm": 0.322265625, "grad_norm_var": 0.001611185073852539, "learning_rate": 0.0001, "loss": 1.3988, "loss/crossentropy": 2.5551153421401978, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19174925982952118, "step": 11395 }, { "epoch": 0.17016701632832856, "grad_norm": 0.279296875, "grad_norm_var": 0.0017130374908447266, "learning_rate": 0.0001, "loss": 1.2837, "loss/crossentropy": 2.548976421356201, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.16648676991462708, "step": 11396 }, { "epoch": 0.17018194849894355, "grad_norm": 0.5703125, "grad_norm_var": 0.004784631729125977, "learning_rate": 0.0001, "loss": 1.8879, "loss/crossentropy": 2.696822762489319, "loss/fcd": 1.578125, "loss/idx": 11.0, "loss/logits": 0.3097967803478241, "step": 11397 }, { "epoch": 0.17019688066955854, "grad_norm": 0.6484375, "grad_norm_var": 0.011011505126953125, "learning_rate": 0.0001, "loss": 1.5763, "loss/crossentropy": 2.490721821784973, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.24422737210988998, "step": 11398 }, { "epoch": 0.1702118128401735, "grad_norm": 0.3828125, "grad_norm_var": 0.010612742106119791, "learning_rate": 0.0001, "loss": 1.672, "loss/crossentropy": 2.7405827045440674, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.2774997353553772, "step": 11399 }, { "epoch": 0.1702267450107885, "grad_norm": 0.283203125, "grad_norm_var": 0.010942188898722331, "learning_rate": 0.0001, "loss": 1.2709, "loss/crossentropy": 2.5668890476226807, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.16156842559576035, "step": 11400 }, { "epoch": 0.17024167718140348, "grad_norm": 0.3125, "grad_norm_var": 0.01093133290608724, "learning_rate": 0.0001, "loss": 1.4225, "loss/crossentropy": 2.8499867916107178, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19983895868062973, "step": 11401 }, { "epoch": 0.17025660935201845, "grad_norm": 0.28125, "grad_norm_var": 0.011064084370930989, "learning_rate": 0.0001, "loss": 1.3157, "loss/crossentropy": 2.510955810546875, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17115283012390137, "step": 11402 }, { "epoch": 0.17027154152263344, "grad_norm": 0.37109375, "grad_norm_var": 0.010859934488932292, "learning_rate": 0.0001, "loss": 1.4556, "loss/crossentropy": 2.497067451477051, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21731740981340408, "step": 11403 }, { "epoch": 0.17028647369324842, "grad_norm": 0.39453125, "grad_norm_var": 0.0109222412109375, "learning_rate": 0.0001, "loss": 1.6112, "loss/crossentropy": 2.6240999698638916, "loss/fcd": 1.390625, "loss/idx": 11.0, "loss/logits": 0.22059761732816696, "step": 11404 }, { "epoch": 0.17030140586386341, "grad_norm": 0.310546875, "grad_norm_var": 0.011050860087076822, "learning_rate": 0.0001, "loss": 1.6677, "loss/crossentropy": 2.4352437257766724, "loss/fcd": 1.4140625, "loss/idx": 11.0, "loss/logits": 0.2536487355828285, "step": 11405 }, { "epoch": 0.17031633803447838, "grad_norm": 0.2890625, "grad_norm_var": 0.011318715413411458, "learning_rate": 0.0001, "loss": 1.4166, "loss/crossentropy": 2.5978413820266724, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19397204369306564, "step": 11406 }, { "epoch": 0.17033127020509337, "grad_norm": 0.384765625, "grad_norm_var": 0.011109145482381184, "learning_rate": 0.0001, "loss": 1.512, "loss/crossentropy": 2.7305922508239746, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.21902736276388168, "step": 11407 }, { "epoch": 0.17034620237570836, "grad_norm": 0.279296875, "grad_norm_var": 0.011475118001302083, "learning_rate": 0.0001, "loss": 1.3485, "loss/crossentropy": 2.5076311826705933, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1805264949798584, "step": 11408 }, { "epoch": 0.17036113454632332, "grad_norm": 0.3125, "grad_norm_var": 0.011254819234212239, "learning_rate": 0.0001, "loss": 1.425, "loss/crossentropy": 2.652352213859558, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19064638018608093, "step": 11409 }, { "epoch": 0.1703760667169383, "grad_norm": 0.298828125, "grad_norm_var": 0.011470778783162435, "learning_rate": 0.0001, "loss": 1.5027, "loss/crossentropy": 2.6137903928756714, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.2214195653796196, "step": 11410 }, { "epoch": 0.1703909988875533, "grad_norm": 0.302734375, "grad_norm_var": 0.011586491266886394, "learning_rate": 0.0001, "loss": 1.4642, "loss/crossentropy": 2.4681334495544434, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20638514310121536, "step": 11411 }, { "epoch": 0.17040593105816826, "grad_norm": 0.310546875, "grad_norm_var": 0.01132658322652181, "learning_rate": 0.0001, "loss": 1.35, "loss/crossentropy": 2.629005789756775, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18202655017375946, "step": 11412 }, { "epoch": 0.17042086322878325, "grad_norm": 0.302734375, "grad_norm_var": 0.00823663075764974, "learning_rate": 0.0001, "loss": 1.4114, "loss/crossentropy": 2.5134761333465576, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18874313682317734, "step": 11413 }, { "epoch": 0.17043579539939824, "grad_norm": 0.486328125, "grad_norm_var": 0.0032459100087483725, "learning_rate": 0.0001, "loss": 1.3807, "loss/crossentropy": 2.92424738407135, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18152400106191635, "step": 11414 }, { "epoch": 0.17045072757001323, "grad_norm": 0.33984375, "grad_norm_var": 0.003066873550415039, "learning_rate": 0.0001, "loss": 1.4639, "loss/crossentropy": 2.6877424716949463, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2217455953359604, "step": 11415 }, { "epoch": 0.1704656597406282, "grad_norm": 0.30859375, "grad_norm_var": 0.002953020731608073, "learning_rate": 0.0001, "loss": 1.4251, "loss/crossentropy": 2.4689525365829468, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19850514829158783, "step": 11416 }, { "epoch": 0.17048059191124318, "grad_norm": 0.62109375, "grad_norm_var": 0.008171590169270833, "learning_rate": 0.0001, "loss": 1.5852, "loss/crossentropy": 2.6757642030715942, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.21407873183488846, "step": 11417 }, { "epoch": 0.17049552408185817, "grad_norm": 0.302734375, "grad_norm_var": 0.00800461769104004, "learning_rate": 0.0001, "loss": 1.3248, "loss/crossentropy": 2.6485302448272705, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.172447070479393, "step": 11418 }, { "epoch": 0.17051045625247313, "grad_norm": 0.337890625, "grad_norm_var": 0.007984352111816407, "learning_rate": 0.0001, "loss": 1.565, "loss/crossentropy": 2.6227328777313232, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.22903753072023392, "step": 11419 }, { "epoch": 0.17052538842308812, "grad_norm": 0.291015625, "grad_norm_var": 0.008023945490519206, "learning_rate": 0.0001, "loss": 1.3147, "loss/crossentropy": 2.5994839668273926, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.18579699099063873, "step": 11420 }, { "epoch": 0.1705403205937031, "grad_norm": 0.322265625, "grad_norm_var": 0.007982746760050455, "learning_rate": 0.0001, "loss": 1.4109, "loss/crossentropy": 2.625697374343872, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19605477899312973, "step": 11421 }, { "epoch": 0.17055525276431807, "grad_norm": 0.314453125, "grad_norm_var": 0.0078399658203125, "learning_rate": 0.0001, "loss": 1.357, "loss/crossentropy": 2.7163506746292114, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18512602150440216, "step": 11422 }, { "epoch": 0.17057018493493306, "grad_norm": 0.318359375, "grad_norm_var": 0.007761065165201823, "learning_rate": 0.0001, "loss": 1.5774, "loss/crossentropy": 2.450889468193054, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.25321636348962784, "step": 11423 }, { "epoch": 0.17058511710554805, "grad_norm": 0.29296875, "grad_norm_var": 0.0076610406239827475, "learning_rate": 0.0001, "loss": 1.3804, "loss/crossentropy": 2.298693537712097, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18513687700033188, "step": 11424 }, { "epoch": 0.17060004927616304, "grad_norm": 0.3203125, "grad_norm_var": 0.007634719212849935, "learning_rate": 0.0001, "loss": 1.3906, "loss/crossentropy": 2.694652557373047, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1874532625079155, "step": 11425 }, { "epoch": 0.170614981446778, "grad_norm": 0.29296875, "grad_norm_var": 0.0076705296834309895, "learning_rate": 0.0001, "loss": 1.3439, "loss/crossentropy": 2.6736656427383423, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18379095196723938, "step": 11426 }, { "epoch": 0.170629913617393, "grad_norm": 0.3046875, "grad_norm_var": 0.007660659154256185, "learning_rate": 0.0001, "loss": 1.4793, "loss/crossentropy": 2.4264861345291138, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21753329038619995, "step": 11427 }, { "epoch": 0.17064484578800798, "grad_norm": 0.283203125, "grad_norm_var": 0.007820876439412434, "learning_rate": 0.0001, "loss": 1.3887, "loss/crossentropy": 2.5757317543029785, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19335008412599564, "step": 11428 }, { "epoch": 0.17065977795862294, "grad_norm": 0.27734375, "grad_norm_var": 0.007987213134765626, "learning_rate": 0.0001, "loss": 1.2923, "loss/crossentropy": 2.5076154470443726, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.16727928072214127, "step": 11429 }, { "epoch": 0.17067471012923793, "grad_norm": 0.298828125, "grad_norm_var": 0.006485748291015625, "learning_rate": 0.0001, "loss": 1.4108, "loss/crossentropy": 2.545380115509033, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1920774057507515, "step": 11430 }, { "epoch": 0.17068964229985292, "grad_norm": 0.32421875, "grad_norm_var": 0.006473541259765625, "learning_rate": 0.0001, "loss": 1.3027, "loss/crossentropy": 2.5425806045532227, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.16986211389303207, "step": 11431 }, { "epoch": 0.1707045744704679, "grad_norm": 0.296875, "grad_norm_var": 0.006508827209472656, "learning_rate": 0.0001, "loss": 1.473, "loss/crossentropy": 2.5698070526123047, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21520236134529114, "step": 11432 }, { "epoch": 0.17071950664108287, "grad_norm": 0.306640625, "grad_norm_var": 0.0002724806467692057, "learning_rate": 0.0001, "loss": 1.404, "loss/crossentropy": 2.7371445894241333, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.18912221491336823, "step": 11433 }, { "epoch": 0.17073443881169786, "grad_norm": 0.72265625, "grad_norm_var": 0.011149851481119792, "learning_rate": 0.0001, "loss": 1.827, "loss/crossentropy": 2.9697736501693726, "loss/fcd": 1.4609375, "loss/idx": 11.0, "loss/logits": 0.36602257192134857, "step": 11434 }, { "epoch": 0.17074937098231285, "grad_norm": 0.302734375, "grad_norm_var": 0.011197344462076823, "learning_rate": 0.0001, "loss": 1.4105, "loss/crossentropy": 2.6084096431732178, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.21910043060779572, "step": 11435 }, { "epoch": 0.17076430315292782, "grad_norm": 0.330078125, "grad_norm_var": 0.011093076070149739, "learning_rate": 0.0001, "loss": 1.3981, "loss/crossentropy": 2.5728254318237305, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.1832670122385025, "step": 11436 }, { "epoch": 0.1707792353235428, "grad_norm": 0.3359375, "grad_norm_var": 0.011087401707967123, "learning_rate": 0.0001, "loss": 1.4535, "loss/crossentropy": 2.5676788091659546, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.1956811547279358, "step": 11437 }, { "epoch": 0.1707941674941578, "grad_norm": 0.32421875, "grad_norm_var": 0.011069679260253906, "learning_rate": 0.0001, "loss": 1.428, "loss/crossentropy": 2.6247317790985107, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19361281394958496, "step": 11438 }, { "epoch": 0.17080909966477276, "grad_norm": 0.365234375, "grad_norm_var": 0.011113929748535156, "learning_rate": 0.0001, "loss": 1.4218, "loss/crossentropy": 2.474096894264221, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.17957276850938797, "step": 11439 }, { "epoch": 0.17082403183538775, "grad_norm": 0.330078125, "grad_norm_var": 0.01098618507385254, "learning_rate": 0.0001, "loss": 1.4868, "loss/crossentropy": 2.442362427711487, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.20550759881734848, "step": 11440 }, { "epoch": 0.17083896400600274, "grad_norm": 0.30859375, "grad_norm_var": 0.011023187637329101, "learning_rate": 0.0001, "loss": 1.3807, "loss/crossentropy": 2.6810073852539062, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18152600526809692, "step": 11441 }, { "epoch": 0.17085389617661773, "grad_norm": 0.287109375, "grad_norm_var": 0.011060333251953125, "learning_rate": 0.0001, "loss": 1.3346, "loss/crossentropy": 2.411795496940613, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.19394323974847794, "step": 11442 }, { "epoch": 0.1708688283472327, "grad_norm": 0.30859375, "grad_norm_var": 0.01104424794514974, "learning_rate": 0.0001, "loss": 1.3684, "loss/crossentropy": 2.699841260910034, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18476475030183792, "step": 11443 }, { "epoch": 0.17088376051784768, "grad_norm": 0.271484375, "grad_norm_var": 0.011137898763020833, "learning_rate": 0.0001, "loss": 1.3529, "loss/crossentropy": 2.5629597902297974, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18488653004169464, "step": 11444 }, { "epoch": 0.17089869268846267, "grad_norm": 0.2734375, "grad_norm_var": 0.011169878641764323, "learning_rate": 0.0001, "loss": 1.3338, "loss/crossentropy": 2.4483816623687744, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1775568127632141, "step": 11445 }, { "epoch": 0.17091362485907763, "grad_norm": 0.294921875, "grad_norm_var": 0.011190541585286458, "learning_rate": 0.0001, "loss": 1.3696, "loss/crossentropy": 2.603829264640808, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18205805122852325, "step": 11446 }, { "epoch": 0.17092855702969262, "grad_norm": 0.27734375, "grad_norm_var": 0.011404164632161458, "learning_rate": 0.0001, "loss": 1.4566, "loss/crossentropy": 2.488764762878418, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21436722576618195, "step": 11447 }, { "epoch": 0.1709434892003076, "grad_norm": 0.328125, "grad_norm_var": 0.011312611897786458, "learning_rate": 0.0001, "loss": 1.5207, "loss/crossentropy": 2.6802737712860107, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.21600010991096497, "step": 11448 }, { "epoch": 0.1709584213709226, "grad_norm": 0.287109375, "grad_norm_var": 0.011411476135253906, "learning_rate": 0.0001, "loss": 1.3548, "loss/crossentropy": 2.7110743522644043, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.17903853207826614, "step": 11449 }, { "epoch": 0.17097335354153756, "grad_norm": 0.28125, "grad_norm_var": 0.0007283528645833333, "learning_rate": 0.0001, "loss": 1.2872, "loss/crossentropy": 2.6525689363479614, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.16608726978302002, "step": 11450 }, { "epoch": 0.17098828571215255, "grad_norm": 0.296875, "grad_norm_var": 0.0007335503896077473, "learning_rate": 0.0001, "loss": 1.4599, "loss/crossentropy": 2.7647546529769897, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2098737582564354, "step": 11451 }, { "epoch": 0.17100321788276754, "grad_norm": 0.357421875, "grad_norm_var": 0.0008670647939046223, "learning_rate": 0.0001, "loss": 1.6387, "loss/crossentropy": 2.599672794342041, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.24024389684200287, "step": 11452 }, { "epoch": 0.1710181500533825, "grad_norm": 0.32421875, "grad_norm_var": 0.0008319695790608724, "learning_rate": 0.0001, "loss": 1.5442, "loss/crossentropy": 2.2559926509857178, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.20044732093811035, "step": 11453 }, { "epoch": 0.1710330822239975, "grad_norm": 0.26953125, "grad_norm_var": 0.0008951663970947265, "learning_rate": 0.0001, "loss": 1.2733, "loss/crossentropy": 2.545352816581726, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.1756312996149063, "step": 11454 }, { "epoch": 0.17104801439461248, "grad_norm": 0.287109375, "grad_norm_var": 0.0006370385487874349, "learning_rate": 0.0001, "loss": 1.3289, "loss/crossentropy": 2.704787492752075, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.18045921623706818, "step": 11455 }, { "epoch": 0.17106294656522744, "grad_norm": 0.322265625, "grad_norm_var": 0.0006084283192952474, "learning_rate": 0.0001, "loss": 1.3754, "loss/crossentropy": 2.7882325649261475, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.17619314789772034, "step": 11456 }, { "epoch": 0.17107787873584243, "grad_norm": 0.3203125, "grad_norm_var": 0.0006328423817952474, "learning_rate": 0.0001, "loss": 1.4221, "loss/crossentropy": 2.703981041908264, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19942091405391693, "step": 11457 }, { "epoch": 0.17109281090645742, "grad_norm": 0.36328125, "grad_norm_var": 0.0008727391560872396, "learning_rate": 0.0001, "loss": 1.4529, "loss/crossentropy": 2.7250508069992065, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20294660329818726, "step": 11458 }, { "epoch": 0.1711077430770724, "grad_norm": 0.34765625, "grad_norm_var": 0.000992266337076823, "learning_rate": 0.0001, "loss": 1.553, "loss/crossentropy": 3.0007355213165283, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.25999075174331665, "step": 11459 }, { "epoch": 0.17112267524768737, "grad_norm": 0.408203125, "grad_norm_var": 0.0015240987141927084, "learning_rate": 0.0001, "loss": 1.5333, "loss/crossentropy": 2.8894237279891968, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.24032365530729294, "step": 11460 }, { "epoch": 0.17113760741830236, "grad_norm": 0.353515625, "grad_norm_var": 0.0014817396799723308, "learning_rate": 0.0001, "loss": 1.6195, "loss/crossentropy": 2.618181586265564, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.24056699872016907, "step": 11461 }, { "epoch": 0.17115253958891735, "grad_norm": 0.2890625, "grad_norm_var": 0.001503435770670573, "learning_rate": 0.0001, "loss": 1.4288, "loss/crossentropy": 2.5318500995635986, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20222150534391403, "step": 11462 }, { "epoch": 0.1711674717595323, "grad_norm": 0.294921875, "grad_norm_var": 0.0014237562815348308, "learning_rate": 0.0001, "loss": 1.3295, "loss/crossentropy": 2.512843370437622, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.17329660803079605, "step": 11463 }, { "epoch": 0.1711824039301473, "grad_norm": 0.287109375, "grad_norm_var": 0.001488176981608073, "learning_rate": 0.0001, "loss": 1.4163, "loss/crossentropy": 2.49695086479187, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19366388022899628, "step": 11464 }, { "epoch": 0.1711973361007623, "grad_norm": 0.2890625, "grad_norm_var": 0.0014803409576416016, "learning_rate": 0.0001, "loss": 1.462, "loss/crossentropy": 2.6975340843200684, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.22373227775096893, "step": 11465 }, { "epoch": 0.17121226827137728, "grad_norm": 0.287109375, "grad_norm_var": 0.0014535903930664063, "learning_rate": 0.0001, "loss": 1.4401, "loss/crossentropy": 2.7202510833740234, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2135426253080368, "step": 11466 }, { "epoch": 0.17122720044199224, "grad_norm": 0.25390625, "grad_norm_var": 0.0016934712727864583, "learning_rate": 0.0001, "loss": 1.2305, "loss/crossentropy": 2.38164222240448, "loss/fcd": 1.07421875, "loss/idx": 11.0, "loss/logits": 0.15626810491085052, "step": 11467 }, { "epoch": 0.17124213261260723, "grad_norm": 0.396484375, "grad_norm_var": 0.0020050048828125, "learning_rate": 0.0001, "loss": 1.598, "loss/crossentropy": 2.559120774269104, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.24253728985786438, "step": 11468 }, { "epoch": 0.17125706478322222, "grad_norm": 0.322265625, "grad_norm_var": 0.0020037174224853517, "learning_rate": 0.0001, "loss": 1.3166, "loss/crossentropy": 2.6458749771118164, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.16423668712377548, "step": 11469 }, { "epoch": 0.17127199695383719, "grad_norm": 0.30078125, "grad_norm_var": 0.0018618106842041016, "learning_rate": 0.0001, "loss": 1.407, "loss/crossentropy": 2.6353724002838135, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.21167605370283127, "step": 11470 }, { "epoch": 0.17128692912445218, "grad_norm": 0.37109375, "grad_norm_var": 0.0019322077433268229, "learning_rate": 0.0001, "loss": 1.5663, "loss/crossentropy": 2.822852849960327, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.26939406991004944, "step": 11471 }, { "epoch": 0.17130186129506716, "grad_norm": 0.32421875, "grad_norm_var": 0.001931619644165039, "learning_rate": 0.0001, "loss": 1.582, "loss/crossentropy": 2.5634909868240356, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.2460583820939064, "step": 11472 }, { "epoch": 0.17131679346568213, "grad_norm": 0.28515625, "grad_norm_var": 0.0020334720611572266, "learning_rate": 0.0001, "loss": 1.3049, "loss/crossentropy": 2.7011290788650513, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1720791608095169, "step": 11473 }, { "epoch": 0.17133172563629712, "grad_norm": 0.26953125, "grad_norm_var": 0.0020838260650634767, "learning_rate": 0.0001, "loss": 1.3373, "loss/crossentropy": 2.6528934240341187, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17710978537797928, "step": 11474 }, { "epoch": 0.1713466578069121, "grad_norm": 0.287109375, "grad_norm_var": 0.002069536844889323, "learning_rate": 0.0001, "loss": 1.3756, "loss/crossentropy": 2.620209574699402, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19202902913093567, "step": 11475 }, { "epoch": 0.1713615899775271, "grad_norm": 0.306640625, "grad_norm_var": 0.001434771219889323, "learning_rate": 0.0001, "loss": 1.3604, "loss/crossentropy": 2.6251755952835083, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18849578499794006, "step": 11476 }, { "epoch": 0.17137652214814206, "grad_norm": 0.310546875, "grad_norm_var": 0.0012858072916666667, "learning_rate": 0.0001, "loss": 1.4884, "loss/crossentropy": 2.437756061553955, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21498196572065353, "step": 11477 }, { "epoch": 0.17139145431875705, "grad_norm": 0.283203125, "grad_norm_var": 0.0013001600901285807, "learning_rate": 0.0001, "loss": 1.2717, "loss/crossentropy": 2.6287044286727905, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.1623176708817482, "step": 11478 }, { "epoch": 0.17140638648937204, "grad_norm": 0.31640625, "grad_norm_var": 0.0013020833333333333, "learning_rate": 0.0001, "loss": 1.4775, "loss/crossentropy": 2.603882312774658, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.2119198739528656, "step": 11479 }, { "epoch": 0.171421318659987, "grad_norm": 0.3203125, "grad_norm_var": 0.0012888431549072266, "learning_rate": 0.0001, "loss": 1.4413, "loss/crossentropy": 2.5692315101623535, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.19525369256734848, "step": 11480 }, { "epoch": 0.171436250830602, "grad_norm": 0.30078125, "grad_norm_var": 0.0012682437896728515, "learning_rate": 0.0001, "loss": 1.3031, "loss/crossentropy": 2.5528063774108887, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1702720746397972, "step": 11481 }, { "epoch": 0.17145118300121698, "grad_norm": 0.2890625, "grad_norm_var": 0.0012629191080729167, "learning_rate": 0.0001, "loss": 1.4393, "loss/crossentropy": 2.4778934717178345, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20100437849760056, "step": 11482 }, { "epoch": 0.17146611517183194, "grad_norm": 0.328125, "grad_norm_var": 0.0010660171508789062, "learning_rate": 0.0001, "loss": 1.3843, "loss/crossentropy": 2.5050312280654907, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18896639347076416, "step": 11483 }, { "epoch": 0.17148104734244693, "grad_norm": 0.35546875, "grad_norm_var": 0.0007158756256103515, "learning_rate": 0.0001, "loss": 1.5592, "loss/crossentropy": 2.464094400405884, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.23106862604618073, "step": 11484 }, { "epoch": 0.17149597951306192, "grad_norm": 0.28125, "grad_norm_var": 0.000757598876953125, "learning_rate": 0.0001, "loss": 1.3158, "loss/crossentropy": 2.4819164276123047, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.18691180646419525, "step": 11485 }, { "epoch": 0.1715109116836769, "grad_norm": 0.3671875, "grad_norm_var": 0.0009683609008789062, "learning_rate": 0.0001, "loss": 1.5799, "loss/crossentropy": 2.6367024183273315, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.2361765280365944, "step": 11486 }, { "epoch": 0.17152584385429187, "grad_norm": 0.345703125, "grad_norm_var": 0.0008094628651936849, "learning_rate": 0.0001, "loss": 1.3685, "loss/crossentropy": 2.8917962312698364, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18488257378339767, "step": 11487 }, { "epoch": 0.17154077602490686, "grad_norm": 0.3125, "grad_norm_var": 0.0007968743642171224, "learning_rate": 0.0001, "loss": 1.4199, "loss/crossentropy": 2.6872180700302124, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.2167365401983261, "step": 11488 }, { "epoch": 0.17155570819552185, "grad_norm": 0.302734375, "grad_norm_var": 0.0007581075032552083, "learning_rate": 0.0001, "loss": 1.4066, "loss/crossentropy": 2.6360121965408325, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20349958539009094, "step": 11489 }, { "epoch": 0.1715706403661368, "grad_norm": 0.3203125, "grad_norm_var": 0.0006382624308268229, "learning_rate": 0.0001, "loss": 1.4826, "loss/crossentropy": 2.7049251794815063, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20913171768188477, "step": 11490 }, { "epoch": 0.1715855725367518, "grad_norm": 0.310546875, "grad_norm_var": 0.0005879084269205729, "learning_rate": 0.0001, "loss": 1.3235, "loss/crossentropy": 2.594847798347473, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17114698886871338, "step": 11491 }, { "epoch": 0.1716005047073668, "grad_norm": 0.296875, "grad_norm_var": 0.000605630874633789, "learning_rate": 0.0001, "loss": 1.56, "loss/crossentropy": 2.671660542488098, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.26310937106609344, "step": 11492 }, { "epoch": 0.17161543687798178, "grad_norm": 0.333984375, "grad_norm_var": 0.0006258487701416016, "learning_rate": 0.0001, "loss": 1.436, "loss/crossentropy": 2.704321265220642, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.1938021406531334, "step": 11493 }, { "epoch": 0.17163036904859674, "grad_norm": 0.3046875, "grad_norm_var": 0.000559234619140625, "learning_rate": 0.0001, "loss": 1.4666, "loss/crossentropy": 2.8371998071670532, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21659843623638153, "step": 11494 }, { "epoch": 0.17164530121921173, "grad_norm": 0.56640625, "grad_norm_var": 0.004416656494140625, "learning_rate": 0.0001, "loss": 1.6827, "loss/crossentropy": 2.9533597230911255, "loss/fcd": 1.4453125, "loss/idx": 11.0, "loss/logits": 0.23742318898439407, "step": 11495 }, { "epoch": 0.17166023338982672, "grad_norm": 0.373046875, "grad_norm_var": 0.0044977664947509766, "learning_rate": 0.0001, "loss": 1.5801, "loss/crossentropy": 2.4358749389648438, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.2246789187192917, "step": 11496 }, { "epoch": 0.17167516556044168, "grad_norm": 0.41796875, "grad_norm_var": 0.004793405532836914, "learning_rate": 0.0001, "loss": 1.7879, "loss/crossentropy": 2.4931763410568237, "loss/fcd": 1.48828125, "loss/idx": 11.0, "loss/logits": 0.2995786666870117, "step": 11497 }, { "epoch": 0.17169009773105667, "grad_norm": 0.4375, "grad_norm_var": 0.005080906550089518, "learning_rate": 0.0001, "loss": 1.7315, "loss/crossentropy": 2.637446165084839, "loss/fcd": 1.4375, "loss/idx": 11.0, "loss/logits": 0.29402367770671844, "step": 11498 }, { "epoch": 0.17170502990167166, "grad_norm": 0.29296875, "grad_norm_var": 0.005276600519816081, "learning_rate": 0.0001, "loss": 1.2881, "loss/crossentropy": 2.4992605447769165, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.16702621430158615, "step": 11499 }, { "epoch": 0.17171996207228662, "grad_norm": 0.314453125, "grad_norm_var": 0.005358378092447917, "learning_rate": 0.0001, "loss": 1.5674, "loss/crossentropy": 2.4475815296173096, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.23539189994335175, "step": 11500 }, { "epoch": 0.17173489424290161, "grad_norm": 0.32421875, "grad_norm_var": 0.005087725321451823, "learning_rate": 0.0001, "loss": 1.544, "loss/crossentropy": 2.6575225591659546, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.2353733628988266, "step": 11501 }, { "epoch": 0.1717498264135166, "grad_norm": 0.31640625, "grad_norm_var": 0.005141448974609375, "learning_rate": 0.0001, "loss": 1.483, "loss/crossentropy": 2.4820570945739746, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.22126533091068268, "step": 11502 }, { "epoch": 0.1717647585841316, "grad_norm": 0.3203125, "grad_norm_var": 0.005190006891886393, "learning_rate": 0.0001, "loss": 1.4324, "loss/crossentropy": 2.583265781402588, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19407760351896286, "step": 11503 }, { "epoch": 0.17177969075474656, "grad_norm": 0.287109375, "grad_norm_var": 0.005345598856608073, "learning_rate": 0.0001, "loss": 1.2654, "loss/crossentropy": 2.6147940158843994, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.1599022001028061, "step": 11504 }, { "epoch": 0.17179462292536155, "grad_norm": 0.34765625, "grad_norm_var": 0.0052187442779541016, "learning_rate": 0.0001, "loss": 1.4598, "loss/crossentropy": 2.8495486974716187, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21760417520999908, "step": 11505 }, { "epoch": 0.17180955509597653, "grad_norm": 0.291015625, "grad_norm_var": 0.005379676818847656, "learning_rate": 0.0001, "loss": 1.337, "loss/crossentropy": 2.522940993309021, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1768389642238617, "step": 11506 }, { "epoch": 0.1718244872665915, "grad_norm": 0.291015625, "grad_norm_var": 0.005495707194010417, "learning_rate": 0.0001, "loss": 1.4043, "loss/crossentropy": 2.6034576892852783, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19334358721971512, "step": 11507 }, { "epoch": 0.1718394194372065, "grad_norm": 0.283203125, "grad_norm_var": 0.005594619115193685, "learning_rate": 0.0001, "loss": 1.4161, "loss/crossentropy": 2.739803671836853, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20123273879289627, "step": 11508 }, { "epoch": 0.17185435160782148, "grad_norm": 0.3046875, "grad_norm_var": 0.0056868871053059895, "learning_rate": 0.0001, "loss": 1.457, "loss/crossentropy": 2.608900785446167, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.23044906556606293, "step": 11509 }, { "epoch": 0.17186928377843647, "grad_norm": 0.28515625, "grad_norm_var": 0.005808003743489583, "learning_rate": 0.0001, "loss": 1.3413, "loss/crossentropy": 2.93966007232666, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18115176260471344, "step": 11510 }, { "epoch": 0.17188421594905143, "grad_norm": 0.27734375, "grad_norm_var": 0.0023358662923177085, "learning_rate": 0.0001, "loss": 1.3817, "loss/crossentropy": 2.6686917543411255, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.20204906910657883, "step": 11511 }, { "epoch": 0.17189914811966642, "grad_norm": 0.36328125, "grad_norm_var": 0.002276341120402018, "learning_rate": 0.0001, "loss": 1.4963, "loss/crossentropy": 2.533328652381897, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.19550155848264694, "step": 11512 }, { "epoch": 0.1719140802902814, "grad_norm": 0.28515625, "grad_norm_var": 0.0016818841298421223, "learning_rate": 0.0001, "loss": 1.4044, "loss/crossentropy": 2.634984254837036, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20124110579490662, "step": 11513 }, { "epoch": 0.17192901246089637, "grad_norm": 0.275390625, "grad_norm_var": 0.00065155029296875, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 2.3747215270996094, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21151579171419144, "step": 11514 }, { "epoch": 0.17194394463151136, "grad_norm": 0.326171875, "grad_norm_var": 0.0006728967030843099, "learning_rate": 0.0001, "loss": 1.4477, "loss/crossentropy": 2.691500186920166, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21328217536211014, "step": 11515 }, { "epoch": 0.17195887680212635, "grad_norm": 0.2890625, "grad_norm_var": 0.0006838480631510417, "learning_rate": 0.0001, "loss": 1.3223, "loss/crossentropy": 2.654765009880066, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17387376725673676, "step": 11516 }, { "epoch": 0.1719738089727413, "grad_norm": 0.298828125, "grad_norm_var": 0.0006563663482666016, "learning_rate": 0.0001, "loss": 1.4082, "loss/crossentropy": 2.5357658863067627, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19340397417545319, "step": 11517 }, { "epoch": 0.1719887411433563, "grad_norm": 0.2890625, "grad_norm_var": 0.0006528059641520182, "learning_rate": 0.0001, "loss": 1.3573, "loss/crossentropy": 2.634191632270813, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.19324766844511032, "step": 11518 }, { "epoch": 0.1720036733139713, "grad_norm": 0.296875, "grad_norm_var": 0.0006264845530192057, "learning_rate": 0.0001, "loss": 1.3625, "loss/crossentropy": 2.496102213859558, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.17504291981458664, "step": 11519 }, { "epoch": 0.17201860548458628, "grad_norm": 0.341796875, "grad_norm_var": 0.0007235050201416015, "learning_rate": 0.0001, "loss": 1.2906, "loss/crossentropy": 2.521006941795349, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1656305193901062, "step": 11520 }, { "epoch": 0.17203353765520124, "grad_norm": 0.474609375, "grad_norm_var": 0.002489153544108073, "learning_rate": 0.0001, "loss": 1.9359, "loss/crossentropy": 2.5260441303253174, "loss/fcd": 1.62109375, "loss/idx": 11.0, "loss/logits": 0.3147933781147003, "step": 11521 }, { "epoch": 0.17204846982581623, "grad_norm": 0.306640625, "grad_norm_var": 0.002463213602701823, "learning_rate": 0.0001, "loss": 1.4504, "loss/crossentropy": 2.4696604013442993, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20041070878505707, "step": 11522 }, { "epoch": 0.17206340199643122, "grad_norm": 0.314453125, "grad_norm_var": 0.002432696024576823, "learning_rate": 0.0001, "loss": 1.441, "loss/crossentropy": 2.4938825368881226, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20665711164474487, "step": 11523 }, { "epoch": 0.17207833416704618, "grad_norm": 0.31640625, "grad_norm_var": 0.0023686567942301433, "learning_rate": 0.0001, "loss": 1.4788, "loss/crossentropy": 2.6433461904525757, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21708300709724426, "step": 11524 }, { "epoch": 0.17209326633766117, "grad_norm": 0.326171875, "grad_norm_var": 0.002367083231608073, "learning_rate": 0.0001, "loss": 1.5874, "loss/crossentropy": 2.441357135772705, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.2514876648783684, "step": 11525 }, { "epoch": 0.17210819850827616, "grad_norm": 0.294921875, "grad_norm_var": 0.002332035700480143, "learning_rate": 0.0001, "loss": 1.4232, "loss/crossentropy": 2.587491750717163, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20449227839708328, "step": 11526 }, { "epoch": 0.17212313067889115, "grad_norm": 0.26953125, "grad_norm_var": 0.0023774305979410806, "learning_rate": 0.0001, "loss": 1.3515, "loss/crossentropy": 2.4536913633346558, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18739686906337738, "step": 11527 }, { "epoch": 0.1721380628495061, "grad_norm": 0.3359375, "grad_norm_var": 0.002254597345987956, "learning_rate": 0.0001, "loss": 1.4535, "loss/crossentropy": 2.7492786645889282, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21130307018756866, "step": 11528 }, { "epoch": 0.1721529950201211, "grad_norm": 0.30078125, "grad_norm_var": 0.0022075494130452474, "learning_rate": 0.0001, "loss": 1.4615, "loss/crossentropy": 2.534849762916565, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2310468852519989, "step": 11529 }, { "epoch": 0.1721679271907361, "grad_norm": 0.275390625, "grad_norm_var": 0.0022075494130452474, "learning_rate": 0.0001, "loss": 1.3831, "loss/crossentropy": 2.564942240715027, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18778619170188904, "step": 11530 }, { "epoch": 0.17218285936135105, "grad_norm": 0.34375, "grad_norm_var": 0.0022506078084309896, "learning_rate": 0.0001, "loss": 1.437, "loss/crossentropy": 2.6683789491653442, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19870364665985107, "step": 11531 }, { "epoch": 0.17219779153196604, "grad_norm": 0.328125, "grad_norm_var": 0.002199745178222656, "learning_rate": 0.0001, "loss": 1.536, "loss/crossentropy": 2.6496328115463257, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.22352279722690582, "step": 11532 }, { "epoch": 0.17221272370258103, "grad_norm": 0.26953125, "grad_norm_var": 0.002334451675415039, "learning_rate": 0.0001, "loss": 1.392, "loss/crossentropy": 2.5903472900390625, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.20451155304908752, "step": 11533 }, { "epoch": 0.172227655873196, "grad_norm": 0.318359375, "grad_norm_var": 0.002276039123535156, "learning_rate": 0.0001, "loss": 1.4345, "loss/crossentropy": 2.702428698539734, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20017379522323608, "step": 11534 }, { "epoch": 0.17224258804381098, "grad_norm": 0.322265625, "grad_norm_var": 0.0022394657135009766, "learning_rate": 0.0001, "loss": 1.5793, "loss/crossentropy": 2.6122188568115234, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.23550747334957123, "step": 11535 }, { "epoch": 0.17225752021442597, "grad_norm": 0.34765625, "grad_norm_var": 0.0022577285766601563, "learning_rate": 0.0001, "loss": 1.3527, "loss/crossentropy": 2.811245918273926, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.19254463911056519, "step": 11536 }, { "epoch": 0.17227245238504096, "grad_norm": 0.263671875, "grad_norm_var": 0.0007333755493164062, "learning_rate": 0.0001, "loss": 1.2824, "loss/crossentropy": 2.6171181201934814, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.16913577169179916, "step": 11537 }, { "epoch": 0.17228738455565593, "grad_norm": 0.30078125, "grad_norm_var": 0.000736856460571289, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.6756848096847534, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2171304002404213, "step": 11538 }, { "epoch": 0.17230231672627092, "grad_norm": 0.2734375, "grad_norm_var": 0.0008066177368164062, "learning_rate": 0.0001, "loss": 1.4053, "loss/crossentropy": 2.4616386890411377, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20220057666301727, "step": 11539 }, { "epoch": 0.1723172488968859, "grad_norm": 0.298828125, "grad_norm_var": 0.000800180435180664, "learning_rate": 0.0001, "loss": 1.3436, "loss/crossentropy": 2.808699607849121, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17563217133283615, "step": 11540 }, { "epoch": 0.17233218106750087, "grad_norm": 0.2890625, "grad_norm_var": 0.0007781346638997395, "learning_rate": 0.0001, "loss": 1.3733, "loss/crossentropy": 2.6751697063446045, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19361189007759094, "step": 11541 }, { "epoch": 0.17234711323811586, "grad_norm": 0.41796875, "grad_norm_var": 0.00160826047261556, "learning_rate": 0.0001, "loss": 1.7968, "loss/crossentropy": 2.71407413482666, "loss/fcd": 1.50390625, "loss/idx": 11.0, "loss/logits": 0.292939268052578, "step": 11542 }, { "epoch": 0.17236204540873085, "grad_norm": 0.291015625, "grad_norm_var": 0.001522064208984375, "learning_rate": 0.0001, "loss": 1.4336, "loss/crossentropy": 2.707728624343872, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19918638467788696, "step": 11543 }, { "epoch": 0.1723769775793458, "grad_norm": 0.298828125, "grad_norm_var": 0.0014849185943603515, "learning_rate": 0.0001, "loss": 1.6426, "loss/crossentropy": 2.5052911043167114, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.27147845923900604, "step": 11544 }, { "epoch": 0.1723919097499608, "grad_norm": 0.3203125, "grad_norm_var": 0.0014880975087483725, "learning_rate": 0.0001, "loss": 1.4454, "loss/crossentropy": 2.586636185646057, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21104547381401062, "step": 11545 }, { "epoch": 0.1724068419205758, "grad_norm": 0.361328125, "grad_norm_var": 0.0015538374582926432, "learning_rate": 0.0001, "loss": 1.5111, "loss/crossentropy": 2.3291449546813965, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.18689075112342834, "step": 11546 }, { "epoch": 0.17242177409119078, "grad_norm": 0.294921875, "grad_norm_var": 0.0015176773071289063, "learning_rate": 0.0001, "loss": 1.4027, "loss/crossentropy": 2.6008877754211426, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.20743153244256973, "step": 11547 }, { "epoch": 0.17243670626180574, "grad_norm": 0.30859375, "grad_norm_var": 0.0015001932779947916, "learning_rate": 0.0001, "loss": 1.363, "loss/crossentropy": 2.7042269706726074, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1872618943452835, "step": 11548 }, { "epoch": 0.17245163843242073, "grad_norm": 0.388671875, "grad_norm_var": 0.001728041966756185, "learning_rate": 0.0001, "loss": 1.6562, "loss/crossentropy": 2.4979037046432495, "loss/fcd": 1.4140625, "loss/idx": 11.0, "loss/logits": 0.24213427305221558, "step": 11549 }, { "epoch": 0.17246657060303572, "grad_norm": 0.2890625, "grad_norm_var": 0.0017821629842122396, "learning_rate": 0.0001, "loss": 1.4235, "loss/crossentropy": 2.5820655822753906, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20471186935901642, "step": 11550 }, { "epoch": 0.17248150277365068, "grad_norm": 0.29296875, "grad_norm_var": 0.0018138726552327474, "learning_rate": 0.0001, "loss": 1.4145, "loss/crossentropy": 2.5772491693496704, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.21137920767068863, "step": 11551 }, { "epoch": 0.17249643494426567, "grad_norm": 0.361328125, "grad_norm_var": 0.0018854141235351562, "learning_rate": 0.0001, "loss": 1.5951, "loss/crossentropy": 2.4200884103775024, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.21226414293050766, "step": 11552 }, { "epoch": 0.17251136711488066, "grad_norm": 0.32421875, "grad_norm_var": 0.0016947269439697265, "learning_rate": 0.0001, "loss": 1.5239, "loss/crossentropy": 2.600649356842041, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.22312137484550476, "step": 11553 }, { "epoch": 0.17252629928549565, "grad_norm": 0.306640625, "grad_norm_var": 0.001682281494140625, "learning_rate": 0.0001, "loss": 1.307, "loss/crossentropy": 2.6949962377548218, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.16244785487651825, "step": 11554 }, { "epoch": 0.1725412314561106, "grad_norm": 0.384765625, "grad_norm_var": 0.001768350601196289, "learning_rate": 0.0001, "loss": 1.6845, "loss/crossentropy": 2.511651039123535, "loss/fcd": 1.40625, "loss/idx": 11.0, "loss/logits": 0.27823713421821594, "step": 11555 }, { "epoch": 0.1725561636267256, "grad_norm": 0.3125, "grad_norm_var": 0.0017290751139322916, "learning_rate": 0.0001, "loss": 1.3975, "loss/crossentropy": 2.590782880783081, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19825421273708344, "step": 11556 }, { "epoch": 0.1725710957973406, "grad_norm": 0.302734375, "grad_norm_var": 0.0016704400380452473, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.589898943901062, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2058260813355446, "step": 11557 }, { "epoch": 0.17258602796795555, "grad_norm": 0.291015625, "grad_norm_var": 0.001163164774576823, "learning_rate": 0.0001, "loss": 1.3811, "loss/crossentropy": 2.6190463304519653, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18190553784370422, "step": 11558 }, { "epoch": 0.17260096013857054, "grad_norm": 0.296875, "grad_norm_var": 0.0011422316233317058, "learning_rate": 0.0001, "loss": 1.5138, "loss/crossentropy": 2.718702554702759, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.22474205493927002, "step": 11559 }, { "epoch": 0.17261589230918553, "grad_norm": 0.263671875, "grad_norm_var": 0.0013230482737223307, "learning_rate": 0.0001, "loss": 1.2427, "loss/crossentropy": 2.5627065896987915, "loss/fcd": 1.08203125, "loss/idx": 11.0, "loss/logits": 0.1606379598379135, "step": 11560 }, { "epoch": 0.1726308244798005, "grad_norm": 0.330078125, "grad_norm_var": 0.0013310750325520833, "learning_rate": 0.0001, "loss": 1.671, "loss/crossentropy": 2.3719231486320496, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.27649756520986557, "step": 11561 }, { "epoch": 0.17264575665041548, "grad_norm": 0.45703125, "grad_norm_var": 0.0024393558502197265, "learning_rate": 0.0001, "loss": 1.4085, "loss/crossentropy": 2.4460079669952393, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20535263419151306, "step": 11562 }, { "epoch": 0.17266068882103047, "grad_norm": 0.400390625, "grad_norm_var": 0.0027071475982666016, "learning_rate": 0.0001, "loss": 1.6727, "loss/crossentropy": 2.5265727043151855, "loss/fcd": 1.40625, "loss/idx": 11.0, "loss/logits": 0.26645730435848236, "step": 11563 }, { "epoch": 0.17267562099164546, "grad_norm": 0.33984375, "grad_norm_var": 0.002671035130818685, "learning_rate": 0.0001, "loss": 1.2399, "loss/crossentropy": 2.7936278581619263, "loss/fcd": 1.078125, "loss/idx": 11.0, "loss/logits": 0.16174796968698502, "step": 11564 }, { "epoch": 0.17269055316226042, "grad_norm": 0.298828125, "grad_norm_var": 0.002518955866495768, "learning_rate": 0.0001, "loss": 1.3578, "loss/crossentropy": 2.544697880744934, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18205173313617706, "step": 11565 }, { "epoch": 0.1727054853328754, "grad_norm": 0.302734375, "grad_norm_var": 0.0024592081705729165, "learning_rate": 0.0001, "loss": 1.3997, "loss/crossentropy": 2.4956737756729126, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20051663368940353, "step": 11566 }, { "epoch": 0.1727204175034904, "grad_norm": 0.283203125, "grad_norm_var": 0.002512216567993164, "learning_rate": 0.0001, "loss": 1.4023, "loss/crossentropy": 2.6325151920318604, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20307154953479767, "step": 11567 }, { "epoch": 0.17273534967410537, "grad_norm": 0.26953125, "grad_norm_var": 0.0026369730631510416, "learning_rate": 0.0001, "loss": 1.3814, "loss/crossentropy": 2.6850597858428955, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18994849175214767, "step": 11568 }, { "epoch": 0.17275028184472035, "grad_norm": 0.267578125, "grad_norm_var": 0.0028264204661051433, "learning_rate": 0.0001, "loss": 1.3231, "loss/crossentropy": 2.473757743835449, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17471147328615189, "step": 11569 }, { "epoch": 0.17276521401533534, "grad_norm": 0.349609375, "grad_norm_var": 0.0028697808583577475, "learning_rate": 0.0001, "loss": 1.6394, "loss/crossentropy": 2.597162961959839, "loss/fcd": 1.390625, "loss/idx": 11.0, "loss/logits": 0.248756542801857, "step": 11570 }, { "epoch": 0.17278014618595033, "grad_norm": 0.263671875, "grad_norm_var": 0.0027712345123291015, "learning_rate": 0.0001, "loss": 1.2912, "loss/crossentropy": 2.4673022031784058, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1544589027762413, "step": 11571 }, { "epoch": 0.1727950783565653, "grad_norm": 0.30859375, "grad_norm_var": 0.002773141860961914, "learning_rate": 0.0001, "loss": 1.55, "loss/crossentropy": 2.644390106201172, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.24138978868722916, "step": 11572 }, { "epoch": 0.17281001052718029, "grad_norm": 0.28515625, "grad_norm_var": 0.002819061279296875, "learning_rate": 0.0001, "loss": 1.4649, "loss/crossentropy": 2.60509192943573, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20313532650470734, "step": 11573 }, { "epoch": 0.17282494269779528, "grad_norm": 0.333984375, "grad_norm_var": 0.0028085708618164062, "learning_rate": 0.0001, "loss": 1.5887, "loss/crossentropy": 2.6739614009857178, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.229359470307827, "step": 11574 }, { "epoch": 0.17283987486841024, "grad_norm": 0.341796875, "grad_norm_var": 0.002822097142537435, "learning_rate": 0.0001, "loss": 1.6006, "loss/crossentropy": 2.44343638420105, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.2451677843928337, "step": 11575 }, { "epoch": 0.17285480703902523, "grad_norm": 0.314453125, "grad_norm_var": 0.0026121616363525392, "learning_rate": 0.0001, "loss": 1.5199, "loss/crossentropy": 2.5973353385925293, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.22697757184505463, "step": 11576 }, { "epoch": 0.17286973920964022, "grad_norm": 0.2890625, "grad_norm_var": 0.0026712417602539062, "learning_rate": 0.0001, "loss": 1.3407, "loss/crossentropy": 2.4663846492767334, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17270126193761826, "step": 11577 }, { "epoch": 0.17288467138025518, "grad_norm": 0.28515625, "grad_norm_var": 0.0013564427693684895, "learning_rate": 0.0001, "loss": 1.3429, "loss/crossentropy": 2.500986337661743, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17497795820236206, "step": 11578 }, { "epoch": 0.17289960355087017, "grad_norm": 0.275390625, "grad_norm_var": 0.0007989883422851562, "learning_rate": 0.0001, "loss": 1.3593, "loss/crossentropy": 2.2310080528259277, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.16787400841712952, "step": 11579 }, { "epoch": 0.17291453572148516, "grad_norm": 0.287109375, "grad_norm_var": 0.0006964206695556641, "learning_rate": 0.0001, "loss": 1.4186, "loss/crossentropy": 2.4575157165527344, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.1764577180147171, "step": 11580 }, { "epoch": 0.17292946789210015, "grad_norm": 0.279296875, "grad_norm_var": 0.0007161299387613932, "learning_rate": 0.0001, "loss": 1.3829, "loss/crossentropy": 2.530377507209778, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19930797815322876, "step": 11581 }, { "epoch": 0.1729444000627151, "grad_norm": 0.310546875, "grad_norm_var": 0.0007269382476806641, "learning_rate": 0.0001, "loss": 1.502, "loss/crossentropy": 2.6176677942276, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2324669435620308, "step": 11582 }, { "epoch": 0.1729593322333301, "grad_norm": 0.400390625, "grad_norm_var": 0.0013773441314697266, "learning_rate": 0.0001, "loss": 1.4945, "loss/crossentropy": 2.5389244556427, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21711425483226776, "step": 11583 }, { "epoch": 0.1729742644039451, "grad_norm": 0.271484375, "grad_norm_var": 0.0013686498006184897, "learning_rate": 0.0001, "loss": 1.4359, "loss/crossentropy": 2.4907807111740112, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.2132926806807518, "step": 11584 }, { "epoch": 0.17298919657456005, "grad_norm": 0.294921875, "grad_norm_var": 0.0012827555338541666, "learning_rate": 0.0001, "loss": 1.371, "loss/crossentropy": 2.5895663499832153, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1912703663110733, "step": 11585 }, { "epoch": 0.17300412874517504, "grad_norm": 0.318359375, "grad_norm_var": 0.0011606852213541666, "learning_rate": 0.0001, "loss": 1.4545, "loss/crossentropy": 2.4793046712875366, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.200596384704113, "step": 11586 }, { "epoch": 0.17301906091579003, "grad_norm": 0.310546875, "grad_norm_var": 0.0010477701822916667, "learning_rate": 0.0001, "loss": 1.3115, "loss/crossentropy": 2.6228431463241577, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17870883643627167, "step": 11587 }, { "epoch": 0.17303399308640502, "grad_norm": 0.27734375, "grad_norm_var": 0.0011006673177083334, "learning_rate": 0.0001, "loss": 1.2662, "loss/crossentropy": 2.600632429122925, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.1685854196548462, "step": 11588 }, { "epoch": 0.17304892525701998, "grad_norm": 0.439453125, "grad_norm_var": 0.0021868228912353517, "learning_rate": 0.0001, "loss": 1.7158, "loss/crossentropy": 2.433752417564392, "loss/fcd": 1.45703125, "loss/idx": 11.0, "loss/logits": 0.25876379758119583, "step": 11589 }, { "epoch": 0.17306385742763497, "grad_norm": 0.330078125, "grad_norm_var": 0.0021775404612223307, "learning_rate": 0.0001, "loss": 1.3957, "loss/crossentropy": 2.6113526821136475, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1965196505188942, "step": 11590 }, { "epoch": 0.17307878959824996, "grad_norm": 0.275390625, "grad_norm_var": 0.0022078037261962892, "learning_rate": 0.0001, "loss": 1.2526, "loss/crossentropy": 2.5847654342651367, "loss/fcd": 1.08984375, "loss/idx": 11.0, "loss/logits": 0.16272959113121033, "step": 11591 }, { "epoch": 0.17309372176886492, "grad_norm": 0.35546875, "grad_norm_var": 0.002337646484375, "learning_rate": 0.0001, "loss": 1.3711, "loss/crossentropy": 2.8035629987716675, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18755442649126053, "step": 11592 }, { "epoch": 0.1731086539394799, "grad_norm": 0.27734375, "grad_norm_var": 0.002382850646972656, "learning_rate": 0.0001, "loss": 1.3788, "loss/crossentropy": 2.4028687477111816, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19913264364004135, "step": 11593 }, { "epoch": 0.1731235861100949, "grad_norm": 0.421875, "grad_norm_var": 0.0030659993489583334, "learning_rate": 0.0001, "loss": 1.6799, "loss/crossentropy": 2.4751715660095215, "loss/fcd": 1.421875, "loss/idx": 11.0, "loss/logits": 0.25805994868278503, "step": 11594 }, { "epoch": 0.17313851828070986, "grad_norm": 0.30859375, "grad_norm_var": 0.0029360294342041016, "learning_rate": 0.0001, "loss": 1.4278, "loss/crossentropy": 2.7889801263809204, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20515523850917816, "step": 11595 }, { "epoch": 0.17315345045132485, "grad_norm": 0.287109375, "grad_norm_var": 0.0029360294342041016, "learning_rate": 0.0001, "loss": 1.3627, "loss/crossentropy": 2.5478031635284424, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18298839032649994, "step": 11596 }, { "epoch": 0.17316838262193984, "grad_norm": 0.279296875, "grad_norm_var": 0.0029360294342041016, "learning_rate": 0.0001, "loss": 1.322, "loss/crossentropy": 2.6928207874298096, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1774495393037796, "step": 11597 }, { "epoch": 0.17318331479255483, "grad_norm": 0.29296875, "grad_norm_var": 0.00298309326171875, "learning_rate": 0.0001, "loss": 1.3144, "loss/crossentropy": 2.7019357681274414, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1893789991736412, "step": 11598 }, { "epoch": 0.1731982469631698, "grad_norm": 0.302734375, "grad_norm_var": 0.0025491714477539062, "learning_rate": 0.0001, "loss": 1.4726, "loss/crossentropy": 2.7447279691696167, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.234285369515419, "step": 11599 }, { "epoch": 0.17321317913378478, "grad_norm": 0.29296875, "grad_norm_var": 0.0024528344472249347, "learning_rate": 0.0001, "loss": 1.487, "loss/crossentropy": 2.5875715017318726, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.21746912598609924, "step": 11600 }, { "epoch": 0.17322811130439977, "grad_norm": 0.28515625, "grad_norm_var": 0.002486928304036458, "learning_rate": 0.0001, "loss": 1.3956, "loss/crossentropy": 2.4967291355133057, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.2002606987953186, "step": 11601 }, { "epoch": 0.17324304347501474, "grad_norm": 0.310546875, "grad_norm_var": 0.0024881998697916666, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.316721737384796, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.23980951309204102, "step": 11602 }, { "epoch": 0.17325797564562972, "grad_norm": 0.267578125, "grad_norm_var": 0.0026315689086914063, "learning_rate": 0.0001, "loss": 1.3751, "loss/crossentropy": 2.5849673748016357, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19543440639972687, "step": 11603 }, { "epoch": 0.17327290781624471, "grad_norm": 0.33203125, "grad_norm_var": 0.0025603612263997394, "learning_rate": 0.0001, "loss": 1.5603, "loss/crossentropy": 2.367368221282959, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.24392634630203247, "step": 11604 }, { "epoch": 0.17328783998685968, "grad_norm": 0.306640625, "grad_norm_var": 0.0014795303344726563, "learning_rate": 0.0001, "loss": 1.3807, "loss/crossentropy": 2.7807623147964478, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18928468227386475, "step": 11605 }, { "epoch": 0.17330277215747467, "grad_norm": 0.283203125, "grad_norm_var": 0.0014780044555664062, "learning_rate": 0.0001, "loss": 1.3558, "loss/crossentropy": 2.4608993530273438, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1916988119482994, "step": 11606 }, { "epoch": 0.17331770432808966, "grad_norm": 0.296875, "grad_norm_var": 0.0014222304026285807, "learning_rate": 0.0001, "loss": 1.3979, "loss/crossentropy": 2.556876301765442, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.20258772373199463, "step": 11607 }, { "epoch": 0.17333263649870465, "grad_norm": 0.31640625, "grad_norm_var": 0.0012613773345947266, "learning_rate": 0.0001, "loss": 1.3965, "loss/crossentropy": 2.890706777572632, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19723620265722275, "step": 11608 }, { "epoch": 0.1733475686693196, "grad_norm": 0.302734375, "grad_norm_var": 0.0012119928995768228, "learning_rate": 0.0001, "loss": 1.2791, "loss/crossentropy": 2.627387046813965, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.16192007809877396, "step": 11609 }, { "epoch": 0.1733625008399346, "grad_norm": 0.3046875, "grad_norm_var": 0.0002506891886393229, "learning_rate": 0.0001, "loss": 1.4281, "loss/crossentropy": 2.4347561597824097, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.18595610558986664, "step": 11610 }, { "epoch": 0.1733774330105496, "grad_norm": 0.3046875, "grad_norm_var": 0.0002461751302083333, "learning_rate": 0.0001, "loss": 1.4113, "loss/crossentropy": 2.580990195274353, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.2003510370850563, "step": 11611 }, { "epoch": 0.17339236518116455, "grad_norm": 0.296875, "grad_norm_var": 0.00023814837137858073, "learning_rate": 0.0001, "loss": 1.3338, "loss/crossentropy": 2.5658382177352905, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.17750848084688187, "step": 11612 }, { "epoch": 0.17340729735177954, "grad_norm": 0.337890625, "grad_norm_var": 0.0003029982248942057, "learning_rate": 0.0001, "loss": 1.3935, "loss/crossentropy": 2.470152974128723, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18254531919956207, "step": 11613 }, { "epoch": 0.17342222952239453, "grad_norm": 0.29296875, "grad_norm_var": 0.0003029982248942057, "learning_rate": 0.0001, "loss": 1.345, "loss/crossentropy": 2.4920343160629272, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18097999691963196, "step": 11614 }, { "epoch": 0.17343716169300952, "grad_norm": 0.32421875, "grad_norm_var": 0.00033359527587890626, "learning_rate": 0.0001, "loss": 1.3724, "loss/crossentropy": 2.4492686986923218, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18098115921020508, "step": 11615 }, { "epoch": 0.17345209386362448, "grad_norm": 0.328125, "grad_norm_var": 0.00036163330078125, "learning_rate": 0.0001, "loss": 1.716, "loss/crossentropy": 2.456624746322632, "loss/fcd": 1.41796875, "loss/idx": 11.0, "loss/logits": 0.29805582016706467, "step": 11616 }, { "epoch": 0.17346702603423947, "grad_norm": 0.34375, "grad_norm_var": 0.00041599273681640624, "learning_rate": 0.0001, "loss": 1.4484, "loss/crossentropy": 2.3816471099853516, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.206218920648098, "step": 11617 }, { "epoch": 0.17348195820485446, "grad_norm": 0.361328125, "grad_norm_var": 0.0005854288736979167, "learning_rate": 0.0001, "loss": 1.4765, "loss/crossentropy": 2.8235899209976196, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21867044270038605, "step": 11618 }, { "epoch": 0.17349689037546942, "grad_norm": 0.3515625, "grad_norm_var": 0.000523233413696289, "learning_rate": 0.0001, "loss": 1.6086, "loss/crossentropy": 2.840221405029297, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.2570713385939598, "step": 11619 }, { "epoch": 0.1735118225460844, "grad_norm": 0.341796875, "grad_norm_var": 0.00054779052734375, "learning_rate": 0.0001, "loss": 1.5355, "loss/crossentropy": 2.7349696159362793, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.23475579917430878, "step": 11620 }, { "epoch": 0.1735267547166994, "grad_norm": 0.30078125, "grad_norm_var": 0.0005590915679931641, "learning_rate": 0.0001, "loss": 1.3092, "loss/crossentropy": 2.523556709289551, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.16467004269361496, "step": 11621 }, { "epoch": 0.17354168688731436, "grad_norm": 0.3359375, "grad_norm_var": 0.00048828125, "learning_rate": 0.0001, "loss": 1.3442, "loss/crossentropy": 2.561071038246155, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.16845616698265076, "step": 11622 }, { "epoch": 0.17355661905792935, "grad_norm": 0.330078125, "grad_norm_var": 0.00044910113016764325, "learning_rate": 0.0001, "loss": 1.4055, "loss/crossentropy": 2.583972454071045, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1945858895778656, "step": 11623 }, { "epoch": 0.17357155122854434, "grad_norm": 0.30078125, "grad_norm_var": 0.00047885576883951824, "learning_rate": 0.0001, "loss": 1.3052, "loss/crossentropy": 2.5607683658599854, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1684366911649704, "step": 11624 }, { "epoch": 0.17358648339915933, "grad_norm": 0.296875, "grad_norm_var": 0.0004963556925455729, "learning_rate": 0.0001, "loss": 1.4625, "loss/crossentropy": 2.4512239694595337, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2164485901594162, "step": 11625 }, { "epoch": 0.1736014155697743, "grad_norm": 0.298828125, "grad_norm_var": 0.0005120436350504557, "learning_rate": 0.0001, "loss": 1.4115, "loss/crossentropy": 2.520844578742981, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19271878898143768, "step": 11626 }, { "epoch": 0.17361634774038928, "grad_norm": 0.322265625, "grad_norm_var": 0.0004915873209635417, "learning_rate": 0.0001, "loss": 1.5084, "loss/crossentropy": 2.520939826965332, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.1880619376897812, "step": 11627 }, { "epoch": 0.17363127991100427, "grad_norm": 0.302734375, "grad_norm_var": 0.0004735151926676432, "learning_rate": 0.0001, "loss": 1.4832, "loss/crossentropy": 2.662414789199829, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21760593354701996, "step": 11628 }, { "epoch": 0.17364621208161923, "grad_norm": 0.3125, "grad_norm_var": 0.00046380360921223957, "learning_rate": 0.0001, "loss": 1.4788, "loss/crossentropy": 2.755169153213501, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.22881924360990524, "step": 11629 }, { "epoch": 0.17366114425223422, "grad_norm": 0.298828125, "grad_norm_var": 0.0004436333974202474, "learning_rate": 0.0001, "loss": 1.4718, "loss/crossentropy": 2.6193944215774536, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.21791422367095947, "step": 11630 }, { "epoch": 0.1736760764228492, "grad_norm": 0.298828125, "grad_norm_var": 0.00047607421875, "learning_rate": 0.0001, "loss": 1.3361, "loss/crossentropy": 2.632899045944214, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.17984112352132797, "step": 11631 }, { "epoch": 0.1736910085934642, "grad_norm": 0.259765625, "grad_norm_var": 0.0006969292958577474, "learning_rate": 0.0001, "loss": 1.2637, "loss/crossentropy": 2.676375150680542, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.15430790930986404, "step": 11632 }, { "epoch": 0.17370594076407916, "grad_norm": 0.298828125, "grad_norm_var": 0.0006570816040039062, "learning_rate": 0.0001, "loss": 1.6409, "loss/crossentropy": 2.4850759506225586, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.26199769973754883, "step": 11633 }, { "epoch": 0.17372087293469415, "grad_norm": 0.34375, "grad_norm_var": 0.000563669204711914, "learning_rate": 0.0001, "loss": 1.4374, "loss/crossentropy": 2.6986966133117676, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19916826486587524, "step": 11634 }, { "epoch": 0.17373580510530914, "grad_norm": 0.30078125, "grad_norm_var": 0.00045787493387858074, "learning_rate": 0.0001, "loss": 1.4758, "loss/crossentropy": 2.589663505554199, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21014781296253204, "step": 11635 }, { "epoch": 0.1737507372759241, "grad_norm": 0.2470703125, "grad_norm_var": 0.0006039579709370931, "learning_rate": 0.0001, "loss": 1.2143, "loss/crossentropy": 2.6018335819244385, "loss/fcd": 1.05859375, "loss/idx": 11.0, "loss/logits": 0.15575412660837173, "step": 11636 }, { "epoch": 0.1737656694465391, "grad_norm": 0.30859375, "grad_norm_var": 0.0006054202715555827, "learning_rate": 0.0001, "loss": 1.4533, "loss/crossentropy": 2.622575044631958, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2072216048836708, "step": 11637 }, { "epoch": 0.17378060161715408, "grad_norm": 0.3515625, "grad_norm_var": 0.0006881992022196452, "learning_rate": 0.0001, "loss": 1.5224, "loss/crossentropy": 2.521552085876465, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.19426022469997406, "step": 11638 }, { "epoch": 0.17379553378776905, "grad_norm": 0.2578125, "grad_norm_var": 0.0007681806882222493, "learning_rate": 0.0001, "loss": 1.308, "loss/crossentropy": 2.609430193901062, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.1674005463719368, "step": 11639 }, { "epoch": 0.17381046595838404, "grad_norm": 0.28125, "grad_norm_var": 0.000789956251780192, "learning_rate": 0.0001, "loss": 1.2943, "loss/crossentropy": 2.8136556148529053, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.1731683909893036, "step": 11640 }, { "epoch": 0.17382539812899903, "grad_norm": 0.306640625, "grad_norm_var": 0.0007934530576070149, "learning_rate": 0.0001, "loss": 1.5216, "loss/crossentropy": 2.6520395278930664, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.22474145144224167, "step": 11641 }, { "epoch": 0.17384033029961402, "grad_norm": 0.29296875, "grad_norm_var": 0.0007960279782613118, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 2.642517328262329, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.24274159967899323, "step": 11642 }, { "epoch": 0.17385526247022898, "grad_norm": 0.3203125, "grad_norm_var": 0.0007902105649312337, "learning_rate": 0.0001, "loss": 1.4154, "loss/crossentropy": 2.7010257244110107, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20446815341711044, "step": 11643 }, { "epoch": 0.17387019464084397, "grad_norm": 0.306640625, "grad_norm_var": 0.0007931669553120931, "learning_rate": 0.0001, "loss": 1.4884, "loss/crossentropy": 2.5805681943893433, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21105653047561646, "step": 11644 }, { "epoch": 0.17388512681145896, "grad_norm": 0.2734375, "grad_norm_var": 0.0008189161618550618, "learning_rate": 0.0001, "loss": 1.334, "loss/crossentropy": 2.6208791732788086, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17383920401334763, "step": 11645 }, { "epoch": 0.17390005898207392, "grad_norm": 0.34765625, "grad_norm_var": 0.0009818355242411296, "learning_rate": 0.0001, "loss": 1.6535, "loss/crossentropy": 2.8295516967773438, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.2668028026819229, "step": 11646 }, { "epoch": 0.1739149911526889, "grad_norm": 0.271484375, "grad_norm_var": 0.0010319034258524577, "learning_rate": 0.0001, "loss": 1.4002, "loss/crossentropy": 2.596735715866089, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.2009800598025322, "step": 11647 }, { "epoch": 0.1739299233233039, "grad_norm": 0.318359375, "grad_norm_var": 0.0009475032488505046, "learning_rate": 0.0001, "loss": 1.477, "loss/crossentropy": 2.9046541452407837, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.2231043130159378, "step": 11648 }, { "epoch": 0.1739448554939189, "grad_norm": 0.29296875, "grad_norm_var": 0.0009518901507059734, "learning_rate": 0.0001, "loss": 1.3901, "loss/crossentropy": 2.665204405784607, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19479478150606155, "step": 11649 }, { "epoch": 0.17395978766453385, "grad_norm": 0.314453125, "grad_norm_var": 0.000839833418528239, "learning_rate": 0.0001, "loss": 1.4694, "loss/crossentropy": 2.7382960319519043, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2232571318745613, "step": 11650 }, { "epoch": 0.17397471983514884, "grad_norm": 0.310546875, "grad_norm_var": 0.000847462813059489, "learning_rate": 0.0001, "loss": 1.7794, "loss/crossentropy": 2.391031861305237, "loss/fcd": 1.4921875, "loss/idx": 11.0, "loss/logits": 0.28718385100364685, "step": 11651 }, { "epoch": 0.17398965200576383, "grad_norm": 0.2890625, "grad_norm_var": 0.00066070556640625, "learning_rate": 0.0001, "loss": 1.3132, "loss/crossentropy": 2.6665892601013184, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1765216514468193, "step": 11652 }, { "epoch": 0.1740045841763788, "grad_norm": 0.31640625, "grad_norm_var": 0.000670623779296875, "learning_rate": 0.0001, "loss": 1.4367, "loss/crossentropy": 2.5476046800613403, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20228708535432816, "step": 11653 }, { "epoch": 0.17401951634699378, "grad_norm": 0.326171875, "grad_norm_var": 0.0005472660064697266, "learning_rate": 0.0001, "loss": 1.3134, "loss/crossentropy": 2.5186225175857544, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17279884964227676, "step": 11654 }, { "epoch": 0.17403444851760877, "grad_norm": 0.3203125, "grad_norm_var": 0.00042621294657389325, "learning_rate": 0.0001, "loss": 1.4911, "loss/crossentropy": 2.604740262031555, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.20987220853567123, "step": 11655 }, { "epoch": 0.17404938068822373, "grad_norm": 0.373046875, "grad_norm_var": 0.0006555557250976562, "learning_rate": 0.0001, "loss": 1.6032, "loss/crossentropy": 2.5744640827178955, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.2399355247616768, "step": 11656 }, { "epoch": 0.17406431285883872, "grad_norm": 0.298828125, "grad_norm_var": 0.0006642023722330729, "learning_rate": 0.0001, "loss": 1.4171, "loss/crossentropy": 2.6532399654388428, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20229826122522354, "step": 11657 }, { "epoch": 0.1740792450294537, "grad_norm": 0.3046875, "grad_norm_var": 0.0006449381510416667, "learning_rate": 0.0001, "loss": 1.3678, "loss/crossentropy": 2.397530198097229, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1803276166319847, "step": 11658 }, { "epoch": 0.1740941772000687, "grad_norm": 0.345703125, "grad_norm_var": 0.0007149855295817058, "learning_rate": 0.0001, "loss": 1.6566, "loss/crossentropy": 2.4527074098587036, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.2621077746152878, "step": 11659 }, { "epoch": 0.17410910937068366, "grad_norm": 0.36328125, "grad_norm_var": 0.0008666356404622396, "learning_rate": 0.0001, "loss": 1.38, "loss/crossentropy": 2.4671071767807007, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19638878852128983, "step": 11660 }, { "epoch": 0.17412404154129865, "grad_norm": 0.3828125, "grad_norm_var": 0.0009841283162434896, "learning_rate": 0.0001, "loss": 1.654, "loss/crossentropy": 2.7957812547683716, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.2595120072364807, "step": 11661 }, { "epoch": 0.17413897371191364, "grad_norm": 0.32421875, "grad_norm_var": 0.0009429295857747396, "learning_rate": 0.0001, "loss": 1.3499, "loss/crossentropy": 2.647334575653076, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18580397963523865, "step": 11662 }, { "epoch": 0.1741539058825286, "grad_norm": 0.306640625, "grad_norm_var": 0.0007832845052083334, "learning_rate": 0.0001, "loss": 1.4638, "loss/crossentropy": 2.709883451461792, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.22163313627243042, "step": 11663 }, { "epoch": 0.1741688380531436, "grad_norm": 0.27734375, "grad_norm_var": 0.0009204705556233723, "learning_rate": 0.0001, "loss": 1.2837, "loss/crossentropy": 2.695591688156128, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.16264302283525467, "step": 11664 }, { "epoch": 0.17418377022375858, "grad_norm": 0.28515625, "grad_norm_var": 0.0009541670481363932, "learning_rate": 0.0001, "loss": 1.3929, "loss/crossentropy": 2.5871392488479614, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19370239228010178, "step": 11665 }, { "epoch": 0.17419870239437354, "grad_norm": 0.271484375, "grad_norm_var": 0.0011080265045166015, "learning_rate": 0.0001, "loss": 1.3952, "loss/crossentropy": 2.4407204389572144, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.20770111680030823, "step": 11666 }, { "epoch": 0.17421363456498853, "grad_norm": 0.3359375, "grad_norm_var": 0.0011214574178059896, "learning_rate": 0.0001, "loss": 1.4639, "loss/crossentropy": 2.609160304069519, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2021438330411911, "step": 11667 }, { "epoch": 0.17422856673560352, "grad_norm": 0.333984375, "grad_norm_var": 0.001061868667602539, "learning_rate": 0.0001, "loss": 1.41, "loss/crossentropy": 2.7871474027633667, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20295003056526184, "step": 11668 }, { "epoch": 0.1742434989062185, "grad_norm": 0.27734375, "grad_norm_var": 0.0011909325917561849, "learning_rate": 0.0001, "loss": 1.3777, "loss/crossentropy": 2.488290548324585, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1862499862909317, "step": 11669 }, { "epoch": 0.17425843107683348, "grad_norm": 0.3125, "grad_norm_var": 0.0011921564737955729, "learning_rate": 0.0001, "loss": 1.4271, "loss/crossentropy": 2.5571300983428955, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20448102802038193, "step": 11670 }, { "epoch": 0.17427336324744847, "grad_norm": 0.33203125, "grad_norm_var": 0.0012018839518229167, "learning_rate": 0.0001, "loss": 1.3848, "loss/crossentropy": 2.602324366569519, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18556033819913864, "step": 11671 }, { "epoch": 0.17428829541806345, "grad_norm": 0.314453125, "grad_norm_var": 0.001004473368326823, "learning_rate": 0.0001, "loss": 1.3988, "loss/crossentropy": 2.483136534690857, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1878969892859459, "step": 11672 }, { "epoch": 0.17430322758867842, "grad_norm": 0.25, "grad_norm_var": 0.00126951535542806, "learning_rate": 0.0001, "loss": 1.2751, "loss/crossentropy": 2.6469318866729736, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.165696419775486, "step": 11673 }, { "epoch": 0.1743181597592934, "grad_norm": 0.310546875, "grad_norm_var": 0.0012646993001302084, "learning_rate": 0.0001, "loss": 1.6373, "loss/crossentropy": 2.3702069520950317, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.2700943201780319, "step": 11674 }, { "epoch": 0.1743330919299084, "grad_norm": 0.3125, "grad_norm_var": 0.001193094253540039, "learning_rate": 0.0001, "loss": 1.4408, "loss/crossentropy": 2.7501035928726196, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.22592541575431824, "step": 11675 }, { "epoch": 0.17434802410052339, "grad_norm": 0.43359375, "grad_norm_var": 0.0019838809967041016, "learning_rate": 0.0001, "loss": 1.6368, "loss/crossentropy": 2.7233457565307617, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.2540139853954315, "step": 11676 }, { "epoch": 0.17436295627113835, "grad_norm": 0.326171875, "grad_norm_var": 0.001681963602701823, "learning_rate": 0.0001, "loss": 1.3949, "loss/crossentropy": 2.822572708129883, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1957070156931877, "step": 11677 }, { "epoch": 0.17437788844175334, "grad_norm": 0.33984375, "grad_norm_var": 0.0017211278279622396, "learning_rate": 0.0001, "loss": 1.4801, "loss/crossentropy": 2.331040143966675, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.21053798496723175, "step": 11678 }, { "epoch": 0.17439282061236833, "grad_norm": 0.28515625, "grad_norm_var": 0.0017702579498291016, "learning_rate": 0.0001, "loss": 1.402, "loss/crossentropy": 2.7211508750915527, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20274092257022858, "step": 11679 }, { "epoch": 0.1744077527829833, "grad_norm": 0.283203125, "grad_norm_var": 0.0017450332641601562, "learning_rate": 0.0001, "loss": 1.3695, "loss/crossentropy": 2.7268184423446655, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1897958144545555, "step": 11680 }, { "epoch": 0.17442268495359828, "grad_norm": 0.29296875, "grad_norm_var": 0.001720110575358073, "learning_rate": 0.0001, "loss": 1.5207, "loss/crossentropy": 2.429724097251892, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.22778117656707764, "step": 11681 }, { "epoch": 0.17443761712421327, "grad_norm": 0.26953125, "grad_norm_var": 0.0017312208811442057, "learning_rate": 0.0001, "loss": 1.319, "loss/crossentropy": 2.5011357069015503, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17833317816257477, "step": 11682 }, { "epoch": 0.17445254929482823, "grad_norm": 0.310546875, "grad_norm_var": 0.0016942342122395833, "learning_rate": 0.0001, "loss": 1.4361, "loss/crossentropy": 3.073646068572998, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.21346106380224228, "step": 11683 }, { "epoch": 0.17446748146544322, "grad_norm": 0.29296875, "grad_norm_var": 0.0016765435536702474, "learning_rate": 0.0001, "loss": 1.3904, "loss/crossentropy": 2.573145627975464, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.210672989487648, "step": 11684 }, { "epoch": 0.1744824136360582, "grad_norm": 0.341796875, "grad_norm_var": 0.0016644795735677083, "learning_rate": 0.0001, "loss": 1.525, "loss/crossentropy": 2.540699601173401, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.22425169497728348, "step": 11685 }, { "epoch": 0.1744973458066732, "grad_norm": 0.62890625, "grad_norm_var": 0.00790093739827474, "learning_rate": 0.0001, "loss": 1.4213, "loss/crossentropy": 2.529260039329529, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.167419895529747, "step": 11686 }, { "epoch": 0.17451227797728816, "grad_norm": 0.326171875, "grad_norm_var": 0.007903655370076498, "learning_rate": 0.0001, "loss": 1.4417, "loss/crossentropy": 2.42505943775177, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20735567063093185, "step": 11687 }, { "epoch": 0.17452721014790315, "grad_norm": 0.396484375, "grad_norm_var": 0.008127959569295247, "learning_rate": 0.0001, "loss": 1.5046, "loss/crossentropy": 2.4991484880447388, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.22723299264907837, "step": 11688 }, { "epoch": 0.17454214231851814, "grad_norm": 0.302734375, "grad_norm_var": 0.0076863606770833336, "learning_rate": 0.0001, "loss": 1.4008, "loss/crossentropy": 2.650050401687622, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.2015899121761322, "step": 11689 }, { "epoch": 0.1745570744891331, "grad_norm": 0.283203125, "grad_norm_var": 0.007843462626139323, "learning_rate": 0.0001, "loss": 1.3213, "loss/crossentropy": 2.6364845037460327, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.172855444252491, "step": 11690 }, { "epoch": 0.1745720066597481, "grad_norm": 0.30078125, "grad_norm_var": 0.007893625895182292, "learning_rate": 0.0001, "loss": 1.3925, "loss/crossentropy": 2.890648603439331, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18937164545059204, "step": 11691 }, { "epoch": 0.17458693883036308, "grad_norm": 0.326171875, "grad_norm_var": 0.007251087824503581, "learning_rate": 0.0001, "loss": 1.4138, "loss/crossentropy": 2.65764844417572, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.21064744144678116, "step": 11692 }, { "epoch": 0.17460187100097807, "grad_norm": 0.3125, "grad_norm_var": 0.007272783915201823, "learning_rate": 0.0001, "loss": 1.4426, "loss/crossentropy": 2.7264331579208374, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.208260178565979, "step": 11693 }, { "epoch": 0.17461680317159303, "grad_norm": 0.30859375, "grad_norm_var": 0.007296180725097657, "learning_rate": 0.0001, "loss": 1.4397, "loss/crossentropy": 2.596185564994812, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.18575549870729446, "step": 11694 }, { "epoch": 0.17463173534220802, "grad_norm": 0.296875, "grad_norm_var": 0.007236480712890625, "learning_rate": 0.0001, "loss": 1.4335, "loss/crossentropy": 2.459129810333252, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19910898059606552, "step": 11695 }, { "epoch": 0.174646667512823, "grad_norm": 0.271484375, "grad_norm_var": 0.007317543029785156, "learning_rate": 0.0001, "loss": 1.3452, "loss/crossentropy": 2.6056102514266968, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1850111410021782, "step": 11696 }, { "epoch": 0.17466159968343797, "grad_norm": 0.2734375, "grad_norm_var": 0.007434844970703125, "learning_rate": 0.0001, "loss": 1.4712, "loss/crossentropy": 2.4309946298599243, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.22121717780828476, "step": 11697 }, { "epoch": 0.17467653185405296, "grad_norm": 0.298828125, "grad_norm_var": 0.007261514663696289, "learning_rate": 0.0001, "loss": 1.4187, "loss/crossentropy": 2.5381652116775513, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19999364763498306, "step": 11698 }, { "epoch": 0.17469146402466795, "grad_norm": 0.31640625, "grad_norm_var": 0.007248878479003906, "learning_rate": 0.0001, "loss": 1.4146, "loss/crossentropy": 2.5059149265289307, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19585542380809784, "step": 11699 }, { "epoch": 0.17470639619528291, "grad_norm": 0.318359375, "grad_norm_var": 0.007164367039998372, "learning_rate": 0.0001, "loss": 1.5457, "loss/crossentropy": 2.459322214126587, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.23322797566652298, "step": 11700 }, { "epoch": 0.1747213283658979, "grad_norm": 0.279296875, "grad_norm_var": 0.007322041193644205, "learning_rate": 0.0001, "loss": 1.3933, "loss/crossentropy": 2.41880202293396, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.20188745111227036, "step": 11701 }, { "epoch": 0.1747362605365129, "grad_norm": 0.291015625, "grad_norm_var": 0.000879351298014323, "learning_rate": 0.0001, "loss": 1.4356, "loss/crossentropy": 2.5282626152038574, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.19345035403966904, "step": 11702 }, { "epoch": 0.17475119270712788, "grad_norm": 0.357421875, "grad_norm_var": 0.001022783915201823, "learning_rate": 0.0001, "loss": 1.64, "loss/crossentropy": 2.3816367387771606, "loss/fcd": 1.4140625, "loss/idx": 11.0, "loss/logits": 0.22590914368629456, "step": 11703 }, { "epoch": 0.17476612487774285, "grad_norm": 0.30078125, "grad_norm_var": 0.00047059059143066405, "learning_rate": 0.0001, "loss": 1.3736, "loss/crossentropy": 2.8444565534591675, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1860872209072113, "step": 11704 }, { "epoch": 0.17478105704835784, "grad_norm": 0.298828125, "grad_norm_var": 0.00047135353088378906, "learning_rate": 0.0001, "loss": 1.4381, "loss/crossentropy": 2.7505457401275635, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20374572277069092, "step": 11705 }, { "epoch": 0.17479598921897282, "grad_norm": 0.2734375, "grad_norm_var": 0.0005019505818684896, "learning_rate": 0.0001, "loss": 1.4133, "loss/crossentropy": 2.466788172721863, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.21014294028282166, "step": 11706 }, { "epoch": 0.1748109213895878, "grad_norm": 0.310546875, "grad_norm_var": 0.0005069573720296224, "learning_rate": 0.0001, "loss": 1.3859, "loss/crossentropy": 2.7848554849624634, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19844932854175568, "step": 11707 }, { "epoch": 0.17482585356020278, "grad_norm": 0.30859375, "grad_norm_var": 0.0004699071248372396, "learning_rate": 0.0001, "loss": 1.32, "loss/crossentropy": 2.7208235263824463, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1676895096898079, "step": 11708 }, { "epoch": 0.17484078573081777, "grad_norm": 0.31640625, "grad_norm_var": 0.000476837158203125, "learning_rate": 0.0001, "loss": 1.2849, "loss/crossentropy": 2.657456159591675, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.16772479563951492, "step": 11709 }, { "epoch": 0.17485571790143276, "grad_norm": 0.287109375, "grad_norm_var": 0.00048470497131347656, "learning_rate": 0.0001, "loss": 1.4143, "loss/crossentropy": 2.532216191291809, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1955561861395836, "step": 11710 }, { "epoch": 0.17487065007204772, "grad_norm": 0.26171875, "grad_norm_var": 0.0005762577056884766, "learning_rate": 0.0001, "loss": 1.2213, "loss/crossentropy": 2.8012969493865967, "loss/fcd": 1.06640625, "loss/idx": 11.0, "loss/logits": 0.15485725551843643, "step": 11711 }, { "epoch": 0.1748855822426627, "grad_norm": 0.298828125, "grad_norm_var": 0.0005273024241129557, "learning_rate": 0.0001, "loss": 1.4536, "loss/crossentropy": 2.722045660018921, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2113809734582901, "step": 11712 }, { "epoch": 0.1749005144132777, "grad_norm": 0.29296875, "grad_norm_var": 0.00048343340555826825, "learning_rate": 0.0001, "loss": 1.4021, "loss/crossentropy": 2.5338516235351562, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.18338535726070404, "step": 11713 }, { "epoch": 0.17491544658389266, "grad_norm": 0.357421875, "grad_norm_var": 0.0006837050120035807, "learning_rate": 0.0001, "loss": 1.3481, "loss/crossentropy": 2.7330182790756226, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17625421285629272, "step": 11714 }, { "epoch": 0.17493037875450765, "grad_norm": 0.267578125, "grad_norm_var": 0.0007540384928385416, "learning_rate": 0.0001, "loss": 1.2752, "loss/crossentropy": 2.512628674507141, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.16193998605012894, "step": 11715 }, { "epoch": 0.17494531092512264, "grad_norm": 0.283203125, "grad_norm_var": 0.0007511774698893229, "learning_rate": 0.0001, "loss": 1.4356, "loss/crossentropy": 2.478790760040283, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2012692242860794, "step": 11716 }, { "epoch": 0.1749602430957376, "grad_norm": 0.26171875, "grad_norm_var": 0.0008168379465738932, "learning_rate": 0.0001, "loss": 1.3552, "loss/crossentropy": 2.6377477645874023, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1911262720823288, "step": 11717 }, { "epoch": 0.1749751752663526, "grad_norm": 0.255859375, "grad_norm_var": 0.0009267012278238933, "learning_rate": 0.0001, "loss": 1.3633, "loss/crossentropy": 2.438913345336914, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19528929889202118, "step": 11718 }, { "epoch": 0.17499010743696758, "grad_norm": 0.380859375, "grad_norm_var": 0.0011536757151285808, "learning_rate": 0.0001, "loss": 1.5929, "loss/crossentropy": 2.861270546913147, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.2764942795038223, "step": 11719 }, { "epoch": 0.17500503960758257, "grad_norm": 0.2890625, "grad_norm_var": 0.0011567274729410808, "learning_rate": 0.0001, "loss": 1.4847, "loss/crossentropy": 2.566356062889099, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.2269086390733719, "step": 11720 }, { "epoch": 0.17501997177819753, "grad_norm": 0.294921875, "grad_norm_var": 0.0011564731597900391, "learning_rate": 0.0001, "loss": 1.4462, "loss/crossentropy": 2.4404146671295166, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20401570200920105, "step": 11721 }, { "epoch": 0.17503490394881252, "grad_norm": 0.287109375, "grad_norm_var": 0.001126543680826823, "learning_rate": 0.0001, "loss": 1.4223, "loss/crossentropy": 2.6258658170700073, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20351167768239975, "step": 11722 }, { "epoch": 0.1750498361194275, "grad_norm": 0.3125, "grad_norm_var": 0.0011302789052327475, "learning_rate": 0.0001, "loss": 1.4222, "loss/crossentropy": 2.565374732017517, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2034522145986557, "step": 11723 }, { "epoch": 0.17506476829004247, "grad_norm": 0.34375, "grad_norm_var": 0.0012607415517171224, "learning_rate": 0.0001, "loss": 1.3869, "loss/crossentropy": 2.496368408203125, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18379799276590347, "step": 11724 }, { "epoch": 0.17507970046065746, "grad_norm": 0.328125, "grad_norm_var": 0.0012958367665608724, "learning_rate": 0.0001, "loss": 1.526, "loss/crossentropy": 2.5137290954589844, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.22524355351924896, "step": 11725 }, { "epoch": 0.17509463263127245, "grad_norm": 0.26953125, "grad_norm_var": 0.0013457616170247397, "learning_rate": 0.0001, "loss": 1.2635, "loss/crossentropy": 2.764123797416687, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.16583527624607086, "step": 11726 }, { "epoch": 0.1751095648018874, "grad_norm": 0.296875, "grad_norm_var": 0.0012479146321614583, "learning_rate": 0.0001, "loss": 1.4141, "loss/crossentropy": 2.7827783823013306, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20321154594421387, "step": 11727 }, { "epoch": 0.1751244969725024, "grad_norm": 0.28125, "grad_norm_var": 0.0012729485829671224, "learning_rate": 0.0001, "loss": 1.4478, "loss/crossentropy": 2.5370761156082153, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20173685252666473, "step": 11728 }, { "epoch": 0.1751394291431174, "grad_norm": 0.3125, "grad_norm_var": 0.0012780348459879556, "learning_rate": 0.0001, "loss": 1.4013, "loss/crossentropy": 2.6443350315093994, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19813387840986252, "step": 11729 }, { "epoch": 0.17515436131373238, "grad_norm": 0.345703125, "grad_norm_var": 0.0011990706125895182, "learning_rate": 0.0001, "loss": 1.4667, "loss/crossentropy": 2.6625999212265015, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2049819678068161, "step": 11730 }, { "epoch": 0.17516929348434734, "grad_norm": 0.294921875, "grad_norm_var": 0.001125192642211914, "learning_rate": 0.0001, "loss": 1.4258, "loss/crossentropy": 2.421577215194702, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20314209908246994, "step": 11731 }, { "epoch": 0.17518422565496233, "grad_norm": 0.294921875, "grad_norm_var": 0.001103830337524414, "learning_rate": 0.0001, "loss": 1.5326, "loss/crossentropy": 2.6312960386276245, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.22404919564723969, "step": 11732 }, { "epoch": 0.17519915782557732, "grad_norm": 0.29296875, "grad_norm_var": 0.0009924411773681641, "learning_rate": 0.0001, "loss": 1.4238, "loss/crossentropy": 2.464022397994995, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1972014456987381, "step": 11733 }, { "epoch": 0.17521408999619228, "grad_norm": 0.359375, "grad_norm_var": 0.0009831746419270834, "learning_rate": 0.0001, "loss": 1.6247, "loss/crossentropy": 2.813505172729492, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.253602109849453, "step": 11734 }, { "epoch": 0.17522902216680727, "grad_norm": 0.345703125, "grad_norm_var": 0.0007354100545247396, "learning_rate": 0.0001, "loss": 1.4839, "loss/crossentropy": 2.5744638442993164, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.20657645910978317, "step": 11735 }, { "epoch": 0.17524395433742226, "grad_norm": 0.296875, "grad_norm_var": 0.0007181167602539062, "learning_rate": 0.0001, "loss": 1.498, "loss/crossentropy": 2.530177593231201, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.21677083522081375, "step": 11736 }, { "epoch": 0.17525888650803725, "grad_norm": 0.30078125, "grad_norm_var": 0.000708627700805664, "learning_rate": 0.0001, "loss": 1.4884, "loss/crossentropy": 2.6327850818634033, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.22672993689775467, "step": 11737 }, { "epoch": 0.17527381867865222, "grad_norm": 0.25390625, "grad_norm_var": 0.000879669189453125, "learning_rate": 0.0001, "loss": 1.3209, "loss/crossentropy": 2.4618595838546753, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1763765960931778, "step": 11738 }, { "epoch": 0.1752887508492672, "grad_norm": 0.3125, "grad_norm_var": 0.000879669189453125, "learning_rate": 0.0001, "loss": 1.3358, "loss/crossentropy": 2.4048322439193726, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1678767129778862, "step": 11739 }, { "epoch": 0.1753036830198822, "grad_norm": 0.373046875, "grad_norm_var": 0.0010725498199462891, "learning_rate": 0.0001, "loss": 1.6016, "loss/crossentropy": 2.8056849241256714, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.25784217566251755, "step": 11740 }, { "epoch": 0.17531861519049716, "grad_norm": 0.296875, "grad_norm_var": 0.0010577996571858725, "learning_rate": 0.0001, "loss": 1.4521, "loss/crossentropy": 2.6712125539779663, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21377086639404297, "step": 11741 }, { "epoch": 0.17533354736111215, "grad_norm": 0.41796875, "grad_norm_var": 0.00167387326558431, "learning_rate": 0.0001, "loss": 1.5769, "loss/crossentropy": 2.527892589569092, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.22145292907953262, "step": 11742 }, { "epoch": 0.17534847953172714, "grad_norm": 0.25390625, "grad_norm_var": 0.0019060611724853516, "learning_rate": 0.0001, "loss": 1.205, "loss/crossentropy": 2.437136173248291, "loss/fcd": 1.05859375, "loss/idx": 11.0, "loss/logits": 0.14642952382564545, "step": 11743 }, { "epoch": 0.1753634117023421, "grad_norm": 0.275390625, "grad_norm_var": 0.0019342422485351563, "learning_rate": 0.0001, "loss": 1.4101, "loss/crossentropy": 2.3910216093063354, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19920477271080017, "step": 11744 }, { "epoch": 0.1753783438729571, "grad_norm": 0.310546875, "grad_norm_var": 0.0019349257151285808, "learning_rate": 0.0001, "loss": 1.526, "loss/crossentropy": 2.596110224723816, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.22519578784704208, "step": 11745 }, { "epoch": 0.17539327604357208, "grad_norm": 0.2890625, "grad_norm_var": 0.00189666748046875, "learning_rate": 0.0001, "loss": 1.4202, "loss/crossentropy": 2.753897547721863, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20929930359125137, "step": 11746 }, { "epoch": 0.17540820821418707, "grad_norm": 0.29296875, "grad_norm_var": 0.0019009749094645182, "learning_rate": 0.0001, "loss": 1.4543, "loss/crossentropy": 2.79179847240448, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2120945304632187, "step": 11747 }, { "epoch": 0.17542314038480203, "grad_norm": 0.310546875, "grad_norm_var": 0.0018839359283447266, "learning_rate": 0.0001, "loss": 1.4839, "loss/crossentropy": 2.5441290140151978, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.2260790541768074, "step": 11748 }, { "epoch": 0.17543807255541702, "grad_norm": 0.31640625, "grad_norm_var": 0.001860666275024414, "learning_rate": 0.0001, "loss": 1.3742, "loss/crossentropy": 2.4871182441711426, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18278122693300247, "step": 11749 }, { "epoch": 0.175453004726032, "grad_norm": 0.33984375, "grad_norm_var": 0.0017633914947509765, "learning_rate": 0.0001, "loss": 1.5068, "loss/crossentropy": 2.4323421716690063, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.2059882953763008, "step": 11750 }, { "epoch": 0.17546793689664697, "grad_norm": 0.318359375, "grad_norm_var": 0.001685953140258789, "learning_rate": 0.0001, "loss": 1.4352, "loss/crossentropy": 2.5268073081970215, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19688121229410172, "step": 11751 }, { "epoch": 0.17548286906726196, "grad_norm": 0.28515625, "grad_norm_var": 0.001714944839477539, "learning_rate": 0.0001, "loss": 1.3924, "loss/crossentropy": 2.6741336584091187, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.2048802226781845, "step": 11752 }, { "epoch": 0.17549780123787695, "grad_norm": 0.255859375, "grad_norm_var": 0.0018915176391601563, "learning_rate": 0.0001, "loss": 1.3464, "loss/crossentropy": 2.6377365589141846, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.19407058507204056, "step": 11753 }, { "epoch": 0.17551273340849194, "grad_norm": 0.361328125, "grad_norm_var": 0.0018609205881754557, "learning_rate": 0.0001, "loss": 1.357, "loss/crossentropy": 2.7483632564544678, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.181263767182827, "step": 11754 }, { "epoch": 0.1755276655791069, "grad_norm": 0.314453125, "grad_norm_var": 0.0018610000610351563, "learning_rate": 0.0001, "loss": 1.4389, "loss/crossentropy": 2.583880305290222, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.19276370108127594, "step": 11755 }, { "epoch": 0.1755425977497219, "grad_norm": 0.2890625, "grad_norm_var": 0.0016320387522379557, "learning_rate": 0.0001, "loss": 1.2892, "loss/crossentropy": 2.322726607322693, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.16808252036571503, "step": 11756 }, { "epoch": 0.17555752992033688, "grad_norm": 0.296875, "grad_norm_var": 0.0016320387522379557, "learning_rate": 0.0001, "loss": 1.4127, "loss/crossentropy": 2.526923894882202, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19784387946128845, "step": 11757 }, { "epoch": 0.17557246209095184, "grad_norm": 0.271484375, "grad_norm_var": 0.0008249918619791667, "learning_rate": 0.0001, "loss": 1.3309, "loss/crossentropy": 2.596550226211548, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1824903041124344, "step": 11758 }, { "epoch": 0.17558739426156683, "grad_norm": 0.314453125, "grad_norm_var": 0.0006914615631103515, "learning_rate": 0.0001, "loss": 1.3388, "loss/crossentropy": 2.342639446258545, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17086409032344818, "step": 11759 }, { "epoch": 0.17560232643218182, "grad_norm": 0.306640625, "grad_norm_var": 0.0006390730539957682, "learning_rate": 0.0001, "loss": 1.2759, "loss/crossentropy": 2.9550050497055054, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.16652941703796387, "step": 11760 }, { "epoch": 0.17561725860279678, "grad_norm": 0.296875, "grad_norm_var": 0.0006398518880208333, "learning_rate": 0.0001, "loss": 1.3739, "loss/crossentropy": 2.7877689599990845, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19423894584178925, "step": 11761 }, { "epoch": 0.17563219077341177, "grad_norm": 0.3125, "grad_norm_var": 0.0006284077962239583, "learning_rate": 0.0001, "loss": 1.4215, "loss/crossentropy": 2.746116518974304, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19494802504777908, "step": 11762 }, { "epoch": 0.17564712294402676, "grad_norm": 0.30859375, "grad_norm_var": 0.0006182352701822917, "learning_rate": 0.0001, "loss": 1.3654, "loss/crossentropy": 2.7159159183502197, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18569490313529968, "step": 11763 }, { "epoch": 0.17566205511464175, "grad_norm": 0.40625, "grad_norm_var": 0.0012467543284098307, "learning_rate": 0.0001, "loss": 1.4966, "loss/crossentropy": 2.9253772497177124, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21922807395458221, "step": 11764 }, { "epoch": 0.1756769872852567, "grad_norm": 0.306640625, "grad_norm_var": 0.0012471516927083333, "learning_rate": 0.0001, "loss": 1.5056, "loss/crossentropy": 2.349555492401123, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.23213008046150208, "step": 11765 }, { "epoch": 0.1756919194558717, "grad_norm": 0.2890625, "grad_norm_var": 0.001216570536295573, "learning_rate": 0.0001, "loss": 1.5326, "loss/crossentropy": 2.180397629737854, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.22013919055461884, "step": 11766 }, { "epoch": 0.1757068516264867, "grad_norm": 0.271484375, "grad_norm_var": 0.0012913386027018229, "learning_rate": 0.0001, "loss": 1.3143, "loss/crossentropy": 2.496436357498169, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.18152840435504913, "step": 11767 }, { "epoch": 0.17572178379710165, "grad_norm": 0.34375, "grad_norm_var": 0.0013476053873697916, "learning_rate": 0.0001, "loss": 1.566, "loss/crossentropy": 2.562270998954773, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2535451203584671, "step": 11768 }, { "epoch": 0.17573671596771664, "grad_norm": 0.291015625, "grad_norm_var": 0.001175371805826823, "learning_rate": 0.0001, "loss": 1.3629, "loss/crossentropy": 2.5151028633117676, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18714969605207443, "step": 11769 }, { "epoch": 0.17575164813833163, "grad_norm": 0.32421875, "grad_norm_var": 0.001013803482055664, "learning_rate": 0.0001, "loss": 1.476, "loss/crossentropy": 2.5615761280059814, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20259808003902435, "step": 11770 }, { "epoch": 0.17576658030894662, "grad_norm": 0.306640625, "grad_norm_var": 0.0010118961334228515, "learning_rate": 0.0001, "loss": 1.4642, "loss/crossentropy": 2.663925051689148, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2141619175672531, "step": 11771 }, { "epoch": 0.17578151247956159, "grad_norm": 0.3125, "grad_norm_var": 0.000985574722290039, "learning_rate": 0.0001, "loss": 1.4377, "loss/crossentropy": 2.6171412467956543, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2111646682024002, "step": 11772 }, { "epoch": 0.17579644465017658, "grad_norm": 0.359375, "grad_norm_var": 0.0011208693186442057, "learning_rate": 0.0001, "loss": 1.6869, "loss/crossentropy": 2.8082081079483032, "loss/fcd": 1.41796875, "loss/idx": 11.0, "loss/logits": 0.2688908353447914, "step": 11773 }, { "epoch": 0.17581137682079157, "grad_norm": 0.296875, "grad_norm_var": 0.00101776123046875, "learning_rate": 0.0001, "loss": 1.4407, "loss/crossentropy": 2.6855767965316772, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.21413695067167282, "step": 11774 }, { "epoch": 0.17582630899140653, "grad_norm": 0.419921875, "grad_norm_var": 0.0016992568969726562, "learning_rate": 0.0001, "loss": 1.4023, "loss/crossentropy": 2.567524790763855, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.18359459936618805, "step": 11775 }, { "epoch": 0.17584124116202152, "grad_norm": 0.283203125, "grad_norm_var": 0.0017816543579101563, "learning_rate": 0.0001, "loss": 1.4837, "loss/crossentropy": 2.5583648681640625, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.24537719786167145, "step": 11776 }, { "epoch": 0.1758561733326365, "grad_norm": 0.3046875, "grad_norm_var": 0.0017608006795247395, "learning_rate": 0.0001, "loss": 1.3524, "loss/crossentropy": 2.5953643321990967, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17272259294986725, "step": 11777 }, { "epoch": 0.17587110550325147, "grad_norm": 0.3359375, "grad_norm_var": 0.0017684300740559895, "learning_rate": 0.0001, "loss": 1.5573, "loss/crossentropy": 2.576547861099243, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.2291855812072754, "step": 11778 }, { "epoch": 0.17588603767386646, "grad_norm": 0.34375, "grad_norm_var": 0.0017804463704427084, "learning_rate": 0.0001, "loss": 1.4772, "loss/crossentropy": 2.7925251722335815, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21158578991889954, "step": 11779 }, { "epoch": 0.17590096984448145, "grad_norm": 0.28515625, "grad_norm_var": 0.0013803482055664063, "learning_rate": 0.0001, "loss": 1.4342, "loss/crossentropy": 2.4395437240600586, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19980232417583466, "step": 11780 }, { "epoch": 0.17591590201509644, "grad_norm": 0.333984375, "grad_norm_var": 0.0013888041178385417, "learning_rate": 0.0001, "loss": 1.5151, "loss/crossentropy": 2.4659461975097656, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.21818207204341888, "step": 11781 }, { "epoch": 0.1759308341857114, "grad_norm": 0.2734375, "grad_norm_var": 0.0014661153157552083, "learning_rate": 0.0001, "loss": 1.4016, "loss/crossentropy": 2.631111264228821, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19454489648342133, "step": 11782 }, { "epoch": 0.1759457663563264, "grad_norm": 0.275390625, "grad_norm_var": 0.0014429092407226562, "learning_rate": 0.0001, "loss": 1.3839, "loss/crossentropy": 2.5086140632629395, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.2002779096364975, "step": 11783 }, { "epoch": 0.17596069852694138, "grad_norm": 0.296875, "grad_norm_var": 0.0014200210571289062, "learning_rate": 0.0001, "loss": 1.4621, "loss/crossentropy": 2.395174264907837, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20433330535888672, "step": 11784 }, { "epoch": 0.17597563069755634, "grad_norm": 0.287109375, "grad_norm_var": 0.001433563232421875, "learning_rate": 0.0001, "loss": 1.3744, "loss/crossentropy": 2.552749276161194, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18299901485443115, "step": 11785 }, { "epoch": 0.17599056286817133, "grad_norm": 0.4921875, "grad_norm_var": 0.003404680887858073, "learning_rate": 0.0001, "loss": 1.5306, "loss/crossentropy": 2.4668396711349487, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.22986875474452972, "step": 11786 }, { "epoch": 0.17600549503878632, "grad_norm": 0.31640625, "grad_norm_var": 0.0033861637115478516, "learning_rate": 0.0001, "loss": 1.3707, "loss/crossentropy": 2.675015449523926, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19102609157562256, "step": 11787 }, { "epoch": 0.17602042720940128, "grad_norm": 0.32421875, "grad_norm_var": 0.003373575210571289, "learning_rate": 0.0001, "loss": 1.3849, "loss/crossentropy": 2.586733818054199, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.17789269983768463, "step": 11788 }, { "epoch": 0.17603535938001627, "grad_norm": 0.248046875, "grad_norm_var": 0.003664398193359375, "learning_rate": 0.0001, "loss": 1.362, "loss/crossentropy": 2.4641653299331665, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.19012627750635147, "step": 11789 }, { "epoch": 0.17605029155063126, "grad_norm": 0.27734375, "grad_norm_var": 0.003748003641764323, "learning_rate": 0.0001, "loss": 1.3556, "loss/crossentropy": 2.7658591270446777, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18374331295490265, "step": 11790 }, { "epoch": 0.17606522372124625, "grad_norm": 0.3046875, "grad_norm_var": 0.0030212243398030597, "learning_rate": 0.0001, "loss": 1.4878, "loss/crossentropy": 2.805048704147339, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.23386342823505402, "step": 11791 }, { "epoch": 0.1760801558918612, "grad_norm": 0.33984375, "grad_norm_var": 0.0030087788899739584, "learning_rate": 0.0001, "loss": 1.4741, "loss/crossentropy": 2.6901159286499023, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21628788858652115, "step": 11792 }, { "epoch": 0.1760950880624762, "grad_norm": 0.353515625, "grad_norm_var": 0.0030910332997639975, "learning_rate": 0.0001, "loss": 1.4781, "loss/crossentropy": 2.6063694953918457, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21634910255670547, "step": 11793 }, { "epoch": 0.1761100202330912, "grad_norm": 0.259765625, "grad_norm_var": 0.0032714207967122397, "learning_rate": 0.0001, "loss": 1.3904, "loss/crossentropy": 2.3876224756240845, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19895616173744202, "step": 11794 }, { "epoch": 0.17612495240370615, "grad_norm": 0.330078125, "grad_norm_var": 0.0032274723052978516, "learning_rate": 0.0001, "loss": 1.4079, "loss/crossentropy": 2.4946099519729614, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19697143882513046, "step": 11795 }, { "epoch": 0.17613988457432114, "grad_norm": 0.34765625, "grad_norm_var": 0.0032447655995686848, "learning_rate": 0.0001, "loss": 1.5112, "loss/crossentropy": 2.54047429561615, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.22210585325956345, "step": 11796 }, { "epoch": 0.17615481674493613, "grad_norm": 0.3046875, "grad_norm_var": 0.0032292683919270832, "learning_rate": 0.0001, "loss": 1.3846, "loss/crossentropy": 2.7967430353164673, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19319408386945724, "step": 11797 }, { "epoch": 0.17616974891555112, "grad_norm": 0.291015625, "grad_norm_var": 0.00315244992574056, "learning_rate": 0.0001, "loss": 1.4475, "loss/crossentropy": 2.779055118560791, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.22091592103242874, "step": 11798 }, { "epoch": 0.17618468108616608, "grad_norm": 0.30859375, "grad_norm_var": 0.0030435562133789063, "learning_rate": 0.0001, "loss": 1.3041, "loss/crossentropy": 2.535733938217163, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.16347260773181915, "step": 11799 }, { "epoch": 0.17619961325678107, "grad_norm": 0.310546875, "grad_norm_var": 0.0030174096425374347, "learning_rate": 0.0001, "loss": 1.3467, "loss/crossentropy": 2.871867299079895, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1982801929116249, "step": 11800 }, { "epoch": 0.17621454542739606, "grad_norm": 0.3515625, "grad_norm_var": 0.0030074437459309896, "learning_rate": 0.0001, "loss": 1.672, "loss/crossentropy": 2.5196359157562256, "loss/fcd": 1.421875, "loss/idx": 11.0, "loss/logits": 0.2501152530312538, "step": 11801 }, { "epoch": 0.17622947759801103, "grad_norm": 0.322265625, "grad_norm_var": 0.0009677728017171224, "learning_rate": 0.0001, "loss": 1.4831, "loss/crossentropy": 2.861028790473938, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21743682026863098, "step": 11802 }, { "epoch": 0.17624440976862601, "grad_norm": 0.28515625, "grad_norm_var": 0.001009988784790039, "learning_rate": 0.0001, "loss": 1.3918, "loss/crossentropy": 2.4588379859924316, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19648579508066177, "step": 11803 }, { "epoch": 0.176259341939241, "grad_norm": 0.279296875, "grad_norm_var": 0.001050567626953125, "learning_rate": 0.0001, "loss": 1.3912, "loss/crossentropy": 2.340955138206482, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.20757828652858734, "step": 11804 }, { "epoch": 0.17627427410985597, "grad_norm": 0.2890625, "grad_norm_var": 0.0008326053619384765, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.589371681213379, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20776783674955368, "step": 11805 }, { "epoch": 0.17628920628047096, "grad_norm": 0.310546875, "grad_norm_var": 0.0007582982381184896, "learning_rate": 0.0001, "loss": 1.3751, "loss/crossentropy": 2.666753888130188, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19545845687389374, "step": 11806 }, { "epoch": 0.17630413845108595, "grad_norm": 0.333984375, "grad_norm_var": 0.0007842858632405599, "learning_rate": 0.0001, "loss": 1.5045, "loss/crossentropy": 2.49513578414917, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21546651422977448, "step": 11807 }, { "epoch": 0.17631907062170094, "grad_norm": 0.287109375, "grad_norm_var": 0.0007735570271809896, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.4506242275238037, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18914073705673218, "step": 11808 }, { "epoch": 0.1763340027923159, "grad_norm": 0.267578125, "grad_norm_var": 0.0007399876912434895, "learning_rate": 0.0001, "loss": 1.3288, "loss/crossentropy": 2.4122421741485596, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17645516991615295, "step": 11809 }, { "epoch": 0.1763489349629309, "grad_norm": 0.291015625, "grad_norm_var": 0.0006128311157226563, "learning_rate": 0.0001, "loss": 1.3971, "loss/crossentropy": 2.645080804824829, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.20175229012966156, "step": 11810 }, { "epoch": 0.17636386713354588, "grad_norm": 0.291015625, "grad_norm_var": 0.0005873998006184895, "learning_rate": 0.0001, "loss": 1.3061, "loss/crossentropy": 2.4265758991241455, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.17720626294612885, "step": 11811 }, { "epoch": 0.17637879930416084, "grad_norm": 0.34375, "grad_norm_var": 0.0005658467610677084, "learning_rate": 0.0001, "loss": 1.4468, "loss/crossentropy": 2.536829352378845, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20463332533836365, "step": 11812 }, { "epoch": 0.17639373147477583, "grad_norm": 0.326171875, "grad_norm_var": 0.0005960941314697265, "learning_rate": 0.0001, "loss": 1.4949, "loss/crossentropy": 2.6368407011032104, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21757326275110245, "step": 11813 }, { "epoch": 0.17640866364539082, "grad_norm": 0.34765625, "grad_norm_var": 0.0006868998209635417, "learning_rate": 0.0001, "loss": 1.5057, "loss/crossentropy": 2.269029140472412, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.18925347179174423, "step": 11814 }, { "epoch": 0.1764235958160058, "grad_norm": 0.294921875, "grad_norm_var": 0.0006994724273681641, "learning_rate": 0.0001, "loss": 1.4606, "loss/crossentropy": 2.685691714286804, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.2536097317934036, "step": 11815 }, { "epoch": 0.17643852798662077, "grad_norm": 0.3203125, "grad_norm_var": 0.0007084528605143229, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 2.6552011966705322, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20362794399261475, "step": 11816 }, { "epoch": 0.17645346015723576, "grad_norm": 0.388671875, "grad_norm_var": 0.0010059197743733724, "learning_rate": 0.0001, "loss": 1.5753, "loss/crossentropy": 2.450736403465271, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.23542218655347824, "step": 11817 }, { "epoch": 0.17646839232785075, "grad_norm": 0.28515625, "grad_norm_var": 0.0010370254516601563, "learning_rate": 0.0001, "loss": 1.3704, "loss/crossentropy": 2.570003628730774, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1946462094783783, "step": 11818 }, { "epoch": 0.1764833244984657, "grad_norm": 0.3046875, "grad_norm_var": 0.0009991963704427083, "learning_rate": 0.0001, "loss": 1.3726, "loss/crossentropy": 2.5638818740844727, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.20075158029794693, "step": 11819 }, { "epoch": 0.1764982566690807, "grad_norm": 0.294921875, "grad_norm_var": 0.0009503682454427083, "learning_rate": 0.0001, "loss": 1.4068, "loss/crossentropy": 2.8614065647125244, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.1997252106666565, "step": 11820 }, { "epoch": 0.1765131888396957, "grad_norm": 0.283203125, "grad_norm_var": 0.0009696801503499349, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.5459580421447754, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.2033906728029251, "step": 11821 }, { "epoch": 0.17652812101031065, "grad_norm": 0.318359375, "grad_norm_var": 0.000973367691040039, "learning_rate": 0.0001, "loss": 1.5081, "loss/crossentropy": 2.9797117710113525, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.23072339594364166, "step": 11822 }, { "epoch": 0.17654305318092564, "grad_norm": 0.302734375, "grad_norm_var": 0.0009392897288004557, "learning_rate": 0.0001, "loss": 1.3052, "loss/crossentropy": 2.6909899711608887, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.17626432329416275, "step": 11823 }, { "epoch": 0.17655798535154063, "grad_norm": 0.2890625, "grad_norm_var": 0.0009337743123372396, "learning_rate": 0.0001, "loss": 1.4737, "loss/crossentropy": 2.48734450340271, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2119884192943573, "step": 11824 }, { "epoch": 0.17657291752215562, "grad_norm": 0.28515625, "grad_norm_var": 0.0008552392323811849, "learning_rate": 0.0001, "loss": 1.4283, "loss/crossentropy": 2.481710195541382, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2017211988568306, "step": 11825 }, { "epoch": 0.17658784969277058, "grad_norm": 0.341796875, "grad_norm_var": 0.0008849938710530599, "learning_rate": 0.0001, "loss": 1.4208, "loss/crossentropy": 2.4890642166137695, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.1863953024148941, "step": 11826 }, { "epoch": 0.17660278186338557, "grad_norm": 0.26953125, "grad_norm_var": 0.0009785334269205729, "learning_rate": 0.0001, "loss": 1.3389, "loss/crossentropy": 2.5481637716293335, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18263308703899384, "step": 11827 }, { "epoch": 0.17661771403400056, "grad_norm": 0.3046875, "grad_norm_var": 0.000909868876139323, "learning_rate": 0.0001, "loss": 1.5389, "loss/crossentropy": 2.619540810585022, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.23808832466602325, "step": 11828 }, { "epoch": 0.17663264620461552, "grad_norm": 0.298828125, "grad_norm_var": 0.0008969624837239583, "learning_rate": 0.0001, "loss": 1.4049, "loss/crossentropy": 2.5109431743621826, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19006171822547913, "step": 11829 }, { "epoch": 0.1766475783752305, "grad_norm": 0.341796875, "grad_norm_var": 0.0008682092030843099, "learning_rate": 0.0001, "loss": 1.6699, "loss/crossentropy": 2.6030784845352173, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.2596946209669113, "step": 11830 }, { "epoch": 0.1766625105458455, "grad_norm": 0.296875, "grad_norm_var": 0.0008651097615559896, "learning_rate": 0.0001, "loss": 1.3119, "loss/crossentropy": 2.6328513622283936, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1752137839794159, "step": 11831 }, { "epoch": 0.1766774427164605, "grad_norm": 0.28515625, "grad_norm_var": 0.0008839925130208333, "learning_rate": 0.0001, "loss": 1.3439, "loss/crossentropy": 2.62945294380188, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1758960783481598, "step": 11832 }, { "epoch": 0.17669237488707545, "grad_norm": 0.318359375, "grad_norm_var": 0.00041478474934895834, "learning_rate": 0.0001, "loss": 1.3432, "loss/crossentropy": 2.738268494606018, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.17909938097000122, "step": 11833 }, { "epoch": 0.17670730705769044, "grad_norm": 0.310546875, "grad_norm_var": 0.0004005273183186849, "learning_rate": 0.0001, "loss": 1.4431, "loss/crossentropy": 2.570988178253174, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20478271692991257, "step": 11834 }, { "epoch": 0.17672223922830543, "grad_norm": 0.26171875, "grad_norm_var": 0.0005054314931233724, "learning_rate": 0.0001, "loss": 1.2777, "loss/crossentropy": 2.5171810388565063, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.16439354419708252, "step": 11835 }, { "epoch": 0.1767371713989204, "grad_norm": 0.361328125, "grad_norm_var": 0.0007345676422119141, "learning_rate": 0.0001, "loss": 1.4331, "loss/crossentropy": 2.523230791091919, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.18697109818458557, "step": 11836 }, { "epoch": 0.17675210356953538, "grad_norm": 0.3046875, "grad_norm_var": 0.0007029215494791667, "learning_rate": 0.0001, "loss": 1.4037, "loss/crossentropy": 2.6259734630584717, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20057393610477448, "step": 11837 }, { "epoch": 0.17676703574015037, "grad_norm": 0.46875, "grad_norm_var": 0.002371072769165039, "learning_rate": 0.0001, "loss": 1.9011, "loss/crossentropy": 2.410539150238037, "loss/fcd": 1.59765625, "loss/idx": 11.0, "loss/logits": 0.30348609387874603, "step": 11838 }, { "epoch": 0.17678196791076534, "grad_norm": 0.306640625, "grad_norm_var": 0.0023656050364176434, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.6587730646133423, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.16919821500778198, "step": 11839 }, { "epoch": 0.17679690008138033, "grad_norm": 0.29296875, "grad_norm_var": 0.00235288937886556, "learning_rate": 0.0001, "loss": 1.4601, "loss/crossentropy": 2.2877551317214966, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.202278234064579, "step": 11840 }, { "epoch": 0.17681183225199532, "grad_norm": 0.294921875, "grad_norm_var": 0.0023192723592122396, "learning_rate": 0.0001, "loss": 1.4352, "loss/crossentropy": 2.567064642906189, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.19299906492233276, "step": 11841 }, { "epoch": 0.1768267644226103, "grad_norm": 0.41015625, "grad_norm_var": 0.002844985326131185, "learning_rate": 0.0001, "loss": 1.5135, "loss/crossentropy": 2.630936026573181, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.23611830174922943, "step": 11842 }, { "epoch": 0.17684169659322527, "grad_norm": 0.30859375, "grad_norm_var": 0.0026752312978108725, "learning_rate": 0.0001, "loss": 1.363, "loss/crossentropy": 2.6513898372650146, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1911308541893959, "step": 11843 }, { "epoch": 0.17685662876384026, "grad_norm": 0.30078125, "grad_norm_var": 0.002685658137003581, "learning_rate": 0.0001, "loss": 1.4197, "loss/crossentropy": 2.5168625116348267, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2009301632642746, "step": 11844 }, { "epoch": 0.17687156093445525, "grad_norm": 0.3046875, "grad_norm_var": 0.002669207255045573, "learning_rate": 0.0001, "loss": 1.5017, "loss/crossentropy": 2.883394956588745, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2321951910853386, "step": 11845 }, { "epoch": 0.1768864931050702, "grad_norm": 0.25, "grad_norm_var": 0.0029657840728759765, "learning_rate": 0.0001, "loss": 1.3235, "loss/crossentropy": 2.327463150024414, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.16721080243587494, "step": 11846 }, { "epoch": 0.1769014252756852, "grad_norm": 0.27734375, "grad_norm_var": 0.003042713801066081, "learning_rate": 0.0001, "loss": 1.364, "loss/crossentropy": 2.4730756282806396, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18823421001434326, "step": 11847 }, { "epoch": 0.1769163574463002, "grad_norm": 0.3203125, "grad_norm_var": 0.0029751936594645183, "learning_rate": 0.0001, "loss": 1.4631, "loss/crossentropy": 2.2185150384902954, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.2052830010652542, "step": 11848 }, { "epoch": 0.17693128961691515, "grad_norm": 0.30859375, "grad_norm_var": 0.0029809951782226564, "learning_rate": 0.0001, "loss": 1.6778, "loss/crossentropy": 2.313242197036743, "loss/fcd": 1.40625, "loss/idx": 11.0, "loss/logits": 0.2715727314352989, "step": 11849 }, { "epoch": 0.17694622178753014, "grad_norm": 0.31640625, "grad_norm_var": 0.002977609634399414, "learning_rate": 0.0001, "loss": 1.6037, "loss/crossentropy": 2.476553440093994, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.25604791939258575, "step": 11850 }, { "epoch": 0.17696115395814513, "grad_norm": 0.33984375, "grad_norm_var": 0.0027728875478108725, "learning_rate": 0.0001, "loss": 1.5186, "loss/crossentropy": 2.4958828687667847, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.22173871099948883, "step": 11851 }, { "epoch": 0.17697608612876012, "grad_norm": 0.310546875, "grad_norm_var": 0.0026737054189046225, "learning_rate": 0.0001, "loss": 1.3791, "loss/crossentropy": 2.287740111351013, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.17990753054618835, "step": 11852 }, { "epoch": 0.17699101829937508, "grad_norm": 0.294921875, "grad_norm_var": 0.0026992162068684897, "learning_rate": 0.0001, "loss": 1.4055, "loss/crossentropy": 2.7010785341262817, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19843634217977524, "step": 11853 }, { "epoch": 0.17700595046999007, "grad_norm": 0.380859375, "grad_norm_var": 0.0014282067616780598, "learning_rate": 0.0001, "loss": 1.5801, "loss/crossentropy": 2.402135968208313, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.24416027963161469, "step": 11854 }, { "epoch": 0.17702088264060506, "grad_norm": 0.3359375, "grad_norm_var": 0.0014546712239583333, "learning_rate": 0.0001, "loss": 1.5459, "loss/crossentropy": 2.3972374200820923, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.22951563447713852, "step": 11855 }, { "epoch": 0.17703581481122002, "grad_norm": 0.291015625, "grad_norm_var": 0.0014607588450113933, "learning_rate": 0.0001, "loss": 1.3244, "loss/crossentropy": 2.502558946609497, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17205557972192764, "step": 11856 }, { "epoch": 0.177050746981835, "grad_norm": 0.328125, "grad_norm_var": 0.0014394124348958333, "learning_rate": 0.0001, "loss": 1.3393, "loss/crossentropy": 2.9100245237350464, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.17520400136709213, "step": 11857 }, { "epoch": 0.17706567915245, "grad_norm": 0.291015625, "grad_norm_var": 0.0008528232574462891, "learning_rate": 0.0001, "loss": 1.4404, "loss/crossentropy": 2.786215305328369, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.21388425678014755, "step": 11858 }, { "epoch": 0.177080611323065, "grad_norm": 0.353515625, "grad_norm_var": 0.000970904032389323, "learning_rate": 0.0001, "loss": 1.6614, "loss/crossentropy": 2.5456345081329346, "loss/fcd": 1.421875, "loss/idx": 11.0, "loss/logits": 0.23948688805103302, "step": 11859 }, { "epoch": 0.17709554349367995, "grad_norm": 0.275390625, "grad_norm_var": 0.0010516961415608724, "learning_rate": 0.0001, "loss": 1.3947, "loss/crossentropy": 2.5218311548233032, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19550706446170807, "step": 11860 }, { "epoch": 0.17711047566429494, "grad_norm": 0.3046875, "grad_norm_var": 0.0010516961415608724, "learning_rate": 0.0001, "loss": 1.4734, "loss/crossentropy": 2.6674059629440308, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21554605662822723, "step": 11861 }, { "epoch": 0.17712540783490993, "grad_norm": 0.263671875, "grad_norm_var": 0.0009518941243489583, "learning_rate": 0.0001, "loss": 1.2573, "loss/crossentropy": 2.463096022605896, "loss/fcd": 1.09375, "loss/idx": 11.0, "loss/logits": 0.16356787085533142, "step": 11862 }, { "epoch": 0.1771403400055249, "grad_norm": 0.279296875, "grad_norm_var": 0.0009431044260660807, "learning_rate": 0.0001, "loss": 1.4443, "loss/crossentropy": 2.6262006759643555, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2099558636546135, "step": 11863 }, { "epoch": 0.17715527217613988, "grad_norm": 0.306640625, "grad_norm_var": 0.0009398778279622396, "learning_rate": 0.0001, "loss": 1.4525, "loss/crossentropy": 2.681917190551758, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21036025136709213, "step": 11864 }, { "epoch": 0.17717020434675487, "grad_norm": 0.349609375, "grad_norm_var": 0.0010303338368733725, "learning_rate": 0.0001, "loss": 1.547, "loss/crossentropy": 2.439856767654419, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2345489114522934, "step": 11865 }, { "epoch": 0.17718513651736983, "grad_norm": 0.265625, "grad_norm_var": 0.001174147923787435, "learning_rate": 0.0001, "loss": 1.2671, "loss/crossentropy": 2.722259759902954, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.1615854650735855, "step": 11866 }, { "epoch": 0.17720006868798482, "grad_norm": 0.302734375, "grad_norm_var": 0.0011158625284830729, "learning_rate": 0.0001, "loss": 1.4343, "loss/crossentropy": 2.706411838531494, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.21555887162685394, "step": 11867 }, { "epoch": 0.1772150008585998, "grad_norm": 0.361328125, "grad_norm_var": 0.0012919108072916667, "learning_rate": 0.0001, "loss": 1.6287, "loss/crossentropy": 2.8242151737213135, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.2459365874528885, "step": 11868 }, { "epoch": 0.1772299330292148, "grad_norm": 0.265625, "grad_norm_var": 0.0014104048411051433, "learning_rate": 0.0001, "loss": 1.3078, "loss/crossentropy": 2.643506646156311, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17494038492441177, "step": 11869 }, { "epoch": 0.17724486519982977, "grad_norm": 0.298828125, "grad_norm_var": 0.0010525862375895183, "learning_rate": 0.0001, "loss": 1.4362, "loss/crossentropy": 2.7168349027633667, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20961587876081467, "step": 11870 }, { "epoch": 0.17725979737044475, "grad_norm": 0.287109375, "grad_norm_var": 0.0009973526000976562, "learning_rate": 0.0001, "loss": 1.3907, "loss/crossentropy": 2.6103938817977905, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1914929747581482, "step": 11871 }, { "epoch": 0.17727472954105974, "grad_norm": 0.388671875, "grad_norm_var": 0.0014567057291666667, "learning_rate": 0.0001, "loss": 1.4254, "loss/crossentropy": 2.4601835012435913, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19888126105070114, "step": 11872 }, { "epoch": 0.1772896617116747, "grad_norm": 0.3125, "grad_norm_var": 0.0014292399088541667, "learning_rate": 0.0001, "loss": 1.4493, "loss/crossentropy": 2.727346897125244, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2071262076497078, "step": 11873 }, { "epoch": 0.1773045938822897, "grad_norm": 0.306640625, "grad_norm_var": 0.0014119466145833333, "learning_rate": 0.0001, "loss": 1.4175, "loss/crossentropy": 2.5384886264801025, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20264476537704468, "step": 11874 }, { "epoch": 0.17731952605290469, "grad_norm": 0.38671875, "grad_norm_var": 0.0016840457916259765, "learning_rate": 0.0001, "loss": 1.5315, "loss/crossentropy": 2.9086785316467285, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.22685404866933823, "step": 11875 }, { "epoch": 0.17733445822351968, "grad_norm": 0.3359375, "grad_norm_var": 0.0016362508138020833, "learning_rate": 0.0001, "loss": 1.4587, "loss/crossentropy": 2.6220474243164062, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20870907604694366, "step": 11876 }, { "epoch": 0.17734939039413464, "grad_norm": 0.333984375, "grad_norm_var": 0.00165556271870931, "learning_rate": 0.0001, "loss": 1.4268, "loss/crossentropy": 2.771863341331482, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19635893404483795, "step": 11877 }, { "epoch": 0.17736432256474963, "grad_norm": 0.306640625, "grad_norm_var": 0.0014751275380452474, "learning_rate": 0.0001, "loss": 1.3309, "loss/crossentropy": 2.772151231765747, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1863466128706932, "step": 11878 }, { "epoch": 0.17737925473536462, "grad_norm": 0.291015625, "grad_norm_var": 0.0014232476552327475, "learning_rate": 0.0001, "loss": 1.358, "loss/crossentropy": 2.580712914466858, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17835619300603867, "step": 11879 }, { "epoch": 0.17739418690597958, "grad_norm": 0.34375, "grad_norm_var": 0.0014495213826497396, "learning_rate": 0.0001, "loss": 1.3838, "loss/crossentropy": 2.5906649827957153, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18063712865114212, "step": 11880 }, { "epoch": 0.17740911907659457, "grad_norm": 0.3046875, "grad_norm_var": 0.0014045556386311849, "learning_rate": 0.0001, "loss": 1.3476, "loss/crossentropy": 2.750128746032715, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18748770654201508, "step": 11881 }, { "epoch": 0.17742405124720956, "grad_norm": 0.30078125, "grad_norm_var": 0.001235183080037435, "learning_rate": 0.0001, "loss": 1.4341, "loss/crossentropy": 2.511388421058655, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19585182517766953, "step": 11882 }, { "epoch": 0.17743898341782452, "grad_norm": 0.298828125, "grad_norm_var": 0.0012453556060791015, "learning_rate": 0.0001, "loss": 1.545, "loss/crossentropy": 2.5681251287460327, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2324976921081543, "step": 11883 }, { "epoch": 0.1774539155884395, "grad_norm": 0.44921875, "grad_norm_var": 0.002210235595703125, "learning_rate": 0.0001, "loss": 1.5993, "loss/crossentropy": 2.681620955467224, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.2985598221421242, "step": 11884 }, { "epoch": 0.1774688477590545, "grad_norm": 0.26171875, "grad_norm_var": 0.0022424697875976563, "learning_rate": 0.0001, "loss": 1.2744, "loss/crossentropy": 2.734686255455017, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.17286550253629684, "step": 11885 }, { "epoch": 0.1774837799296695, "grad_norm": 0.333984375, "grad_norm_var": 0.002194976806640625, "learning_rate": 0.0001, "loss": 1.6088, "loss/crossentropy": 2.440048336982727, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.24159858375787735, "step": 11886 }, { "epoch": 0.17749871210028445, "grad_norm": 0.302734375, "grad_norm_var": 0.0021258036295572916, "learning_rate": 0.0001, "loss": 1.5223, "loss/crossentropy": 2.2572847604751587, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.2176326885819435, "step": 11887 }, { "epoch": 0.17751364427089944, "grad_norm": 0.33984375, "grad_norm_var": 0.0018838087717692057, "learning_rate": 0.0001, "loss": 1.4686, "loss/crossentropy": 2.451033592224121, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.19517727196216583, "step": 11888 }, { "epoch": 0.17752857644151443, "grad_norm": 0.427734375, "grad_norm_var": 0.0025130589803059895, "learning_rate": 0.0001, "loss": 1.3732, "loss/crossentropy": 2.864977478981018, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18570799380540848, "step": 11889 }, { "epoch": 0.1775435086121294, "grad_norm": 0.30078125, "grad_norm_var": 0.0025356133778889974, "learning_rate": 0.0001, "loss": 1.3066, "loss/crossentropy": 2.713661551475525, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.16600152105093002, "step": 11890 }, { "epoch": 0.17755844078274438, "grad_norm": 0.318359375, "grad_norm_var": 0.0023325602213541668, "learning_rate": 0.0001, "loss": 1.2921, "loss/crossentropy": 2.5952948331832886, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.1710517257452011, "step": 11891 }, { "epoch": 0.17757337295335937, "grad_norm": 0.27734375, "grad_norm_var": 0.002486101786295573, "learning_rate": 0.0001, "loss": 1.4009, "loss/crossentropy": 2.7423436641693115, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20168866962194443, "step": 11892 }, { "epoch": 0.17758830512397433, "grad_norm": 0.271484375, "grad_norm_var": 0.0026508967081705728, "learning_rate": 0.0001, "loss": 1.3816, "loss/crossentropy": 2.5995230674743652, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19405792653560638, "step": 11893 }, { "epoch": 0.17760323729458932, "grad_norm": 0.291015625, "grad_norm_var": 0.002695147196451823, "learning_rate": 0.0001, "loss": 1.3497, "loss/crossentropy": 2.706879734992981, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1856428012251854, "step": 11894 }, { "epoch": 0.1776181694652043, "grad_norm": 0.353515625, "grad_norm_var": 0.002701250712076823, "learning_rate": 0.0001, "loss": 1.4907, "loss/crossentropy": 2.506592035293579, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.2094540297985077, "step": 11895 }, { "epoch": 0.1776331016358193, "grad_norm": 0.306640625, "grad_norm_var": 0.00268705685933431, "learning_rate": 0.0001, "loss": 1.5449, "loss/crossentropy": 2.634299159049988, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.2402496486902237, "step": 11896 }, { "epoch": 0.17764803380643426, "grad_norm": 0.33203125, "grad_norm_var": 0.0026737054189046225, "learning_rate": 0.0001, "loss": 1.3663, "loss/crossentropy": 2.6302154064178467, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.17490362375974655, "step": 11897 }, { "epoch": 0.17766296597704925, "grad_norm": 0.353515625, "grad_norm_var": 0.0026921590169270834, "learning_rate": 0.0001, "loss": 1.3779, "loss/crossentropy": 2.477660298347473, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.16695644706487656, "step": 11898 }, { "epoch": 0.17767789814766424, "grad_norm": 0.306640625, "grad_norm_var": 0.0026674906412760416, "learning_rate": 0.0001, "loss": 1.4552, "loss/crossentropy": 2.5428982973098755, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20907442271709442, "step": 11899 }, { "epoch": 0.1776928303182792, "grad_norm": 0.30859375, "grad_norm_var": 0.0016054789225260417, "learning_rate": 0.0001, "loss": 1.4626, "loss/crossentropy": 2.7131868600845337, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20869717746973038, "step": 11900 }, { "epoch": 0.1777077624888942, "grad_norm": 0.296875, "grad_norm_var": 0.001419512430826823, "learning_rate": 0.0001, "loss": 1.5155, "loss/crossentropy": 2.5836671590805054, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.22645633667707443, "step": 11901 }, { "epoch": 0.17772269465950918, "grad_norm": 0.3515625, "grad_norm_var": 0.0014714399973551433, "learning_rate": 0.0001, "loss": 1.6004, "loss/crossentropy": 2.563691735267639, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.2371152862906456, "step": 11902 }, { "epoch": 0.17773762683012417, "grad_norm": 0.283203125, "grad_norm_var": 0.001543283462524414, "learning_rate": 0.0001, "loss": 1.3269, "loss/crossentropy": 2.6888850927352905, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1784459799528122, "step": 11903 }, { "epoch": 0.17775255900073914, "grad_norm": 0.333984375, "grad_norm_var": 0.0015298843383789063, "learning_rate": 0.0001, "loss": 1.4189, "loss/crossentropy": 2.49833881855011, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.18065498024225235, "step": 11904 }, { "epoch": 0.17776749117135413, "grad_norm": 0.294921875, "grad_norm_var": 0.0007170995076497395, "learning_rate": 0.0001, "loss": 1.4201, "loss/crossentropy": 2.6643285751342773, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20916935801506042, "step": 11905 }, { "epoch": 0.17778242334196911, "grad_norm": 0.3046875, "grad_norm_var": 0.00071258544921875, "learning_rate": 0.0001, "loss": 1.4567, "loss/crossentropy": 2.698174476623535, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21454964578151703, "step": 11906 }, { "epoch": 0.17779735551258408, "grad_norm": 0.369140625, "grad_norm_var": 0.0009200414021809896, "learning_rate": 0.0001, "loss": 1.361, "loss/crossentropy": 2.4342458248138428, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.17740748077630997, "step": 11907 }, { "epoch": 0.17781228768319907, "grad_norm": 0.328125, "grad_norm_var": 0.0008282979329427083, "learning_rate": 0.0001, "loss": 1.5271, "loss/crossentropy": 2.570022463798523, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2341601550579071, "step": 11908 }, { "epoch": 0.17782721985381406, "grad_norm": 0.287109375, "grad_norm_var": 0.000746917724609375, "learning_rate": 0.0001, "loss": 1.3484, "loss/crossentropy": 2.7759872674942017, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1882820501923561, "step": 11909 }, { "epoch": 0.17784215202442902, "grad_norm": 0.55078125, "grad_norm_var": 0.004000329971313476, "learning_rate": 0.0001, "loss": 1.6185, "loss/crossentropy": 2.6765748262405396, "loss/fcd": 1.375, "loss/idx": 11.0, "loss/logits": 0.24350054562091827, "step": 11910 }, { "epoch": 0.177857084195044, "grad_norm": 0.421875, "grad_norm_var": 0.004460398356119792, "learning_rate": 0.0001, "loss": 1.418, "loss/crossentropy": 2.4384899139404297, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19146448373794556, "step": 11911 }, { "epoch": 0.177872016365659, "grad_norm": 0.296875, "grad_norm_var": 0.00450895627339681, "learning_rate": 0.0001, "loss": 1.4172, "loss/crossentropy": 2.6656805276870728, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20627164095640182, "step": 11912 }, { "epoch": 0.177886948536274, "grad_norm": 0.306640625, "grad_norm_var": 0.004571978251139323, "learning_rate": 0.0001, "loss": 1.3793, "loss/crossentropy": 2.4931774139404297, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19180165231227875, "step": 11913 }, { "epoch": 0.17790188070688895, "grad_norm": 0.3203125, "grad_norm_var": 0.00456846555074056, "learning_rate": 0.0001, "loss": 1.4049, "loss/crossentropy": 2.681831121444702, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.2017732709646225, "step": 11914 }, { "epoch": 0.17791681287750394, "grad_norm": 0.279296875, "grad_norm_var": 0.004718891779581706, "learning_rate": 0.0001, "loss": 1.3151, "loss/crossentropy": 2.6926398277282715, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.17834249138832092, "step": 11915 }, { "epoch": 0.17793174504811893, "grad_norm": 0.419921875, "grad_norm_var": 0.005125681559244792, "learning_rate": 0.0001, "loss": 1.5718, "loss/crossentropy": 2.475012421607971, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.2241268754005432, "step": 11916 }, { "epoch": 0.1779466772187339, "grad_norm": 0.326171875, "grad_norm_var": 0.005009571711222331, "learning_rate": 0.0001, "loss": 1.5174, "loss/crossentropy": 2.3860554695129395, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2243887260556221, "step": 11917 }, { "epoch": 0.17796160938934888, "grad_norm": 0.30859375, "grad_norm_var": 0.005071115493774414, "learning_rate": 0.0001, "loss": 1.4803, "loss/crossentropy": 2.62009334564209, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20685544610023499, "step": 11918 }, { "epoch": 0.17797654155996387, "grad_norm": 0.3203125, "grad_norm_var": 0.0048787434895833336, "learning_rate": 0.0001, "loss": 1.4423, "loss/crossentropy": 2.50952410697937, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19229231029748917, "step": 11919 }, { "epoch": 0.17799147373057886, "grad_norm": 0.345703125, "grad_norm_var": 0.00487511952718099, "learning_rate": 0.0001, "loss": 1.6438, "loss/crossentropy": 2.4055471420288086, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.24930567294359207, "step": 11920 }, { "epoch": 0.17800640590119382, "grad_norm": 0.33984375, "grad_norm_var": 0.0047160943349202475, "learning_rate": 0.0001, "loss": 1.4066, "loss/crossentropy": 2.803318738937378, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.1917969137430191, "step": 11921 }, { "epoch": 0.1780213380718088, "grad_norm": 0.318359375, "grad_norm_var": 0.0046536763509114586, "learning_rate": 0.0001, "loss": 1.4091, "loss/crossentropy": 2.407426357269287, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.17865748703479767, "step": 11922 }, { "epoch": 0.1780362702424238, "grad_norm": 0.3125, "grad_norm_var": 0.0046808719635009766, "learning_rate": 0.0001, "loss": 1.4178, "loss/crossentropy": 2.622041344642639, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19907036423683167, "step": 11923 }, { "epoch": 0.17805120241303876, "grad_norm": 0.2890625, "grad_norm_var": 0.004851897557576497, "learning_rate": 0.0001, "loss": 1.4177, "loss/crossentropy": 2.484472393989563, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20289695262908936, "step": 11924 }, { "epoch": 0.17806613458365375, "grad_norm": 0.34375, "grad_norm_var": 0.004651387532552083, "learning_rate": 0.0001, "loss": 1.4511, "loss/crossentropy": 2.552881360054016, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.1971629112958908, "step": 11925 }, { "epoch": 0.17808106675426874, "grad_norm": 0.58203125, "grad_norm_var": 0.005575052897135417, "learning_rate": 0.0001, "loss": 1.4817, "loss/crossentropy": 2.753144145011902, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.20438691973686218, "step": 11926 }, { "epoch": 0.1780959989248837, "grad_norm": 0.30859375, "grad_norm_var": 0.0052265803019205725, "learning_rate": 0.0001, "loss": 1.3414, "loss/crossentropy": 2.558180093765259, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1773488074541092, "step": 11927 }, { "epoch": 0.1781109310954987, "grad_norm": 0.279296875, "grad_norm_var": 0.005343739191691081, "learning_rate": 0.0001, "loss": 1.3778, "loss/crossentropy": 2.7080157995224, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.186418816447258, "step": 11928 }, { "epoch": 0.17812586326611368, "grad_norm": 0.2890625, "grad_norm_var": 0.005435434977213541, "learning_rate": 0.0001, "loss": 1.2919, "loss/crossentropy": 2.4995076656341553, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.16295521706342697, "step": 11929 }, { "epoch": 0.17814079543672867, "grad_norm": 0.30078125, "grad_norm_var": 0.0055012385050455725, "learning_rate": 0.0001, "loss": 1.3721, "loss/crossentropy": 2.564621925354004, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18850523978471756, "step": 11930 }, { "epoch": 0.17815572760734363, "grad_norm": 0.333984375, "grad_norm_var": 0.005280494689941406, "learning_rate": 0.0001, "loss": 1.5507, "loss/crossentropy": 2.3555551767349243, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.20694825053215027, "step": 11931 }, { "epoch": 0.17817065977795862, "grad_norm": 0.31640625, "grad_norm_var": 0.004828119277954101, "learning_rate": 0.0001, "loss": 1.3664, "loss/crossentropy": 2.714341402053833, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19058477878570557, "step": 11932 }, { "epoch": 0.1781855919485736, "grad_norm": 0.28125, "grad_norm_var": 0.0049900690714518225, "learning_rate": 0.0001, "loss": 1.3165, "loss/crossentropy": 2.588688373565674, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.18368113040924072, "step": 11933 }, { "epoch": 0.17820052411918857, "grad_norm": 0.296875, "grad_norm_var": 0.005031077067057291, "learning_rate": 0.0001, "loss": 1.4974, "loss/crossentropy": 2.3639639616012573, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.21223150193691254, "step": 11934 }, { "epoch": 0.17821545628980356, "grad_norm": 0.357421875, "grad_norm_var": 0.005076074600219726, "learning_rate": 0.0001, "loss": 1.537, "loss/crossentropy": 2.6631860733032227, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.2049727961421013, "step": 11935 }, { "epoch": 0.17823038846041855, "grad_norm": 0.353515625, "grad_norm_var": 0.005095275243123373, "learning_rate": 0.0001, "loss": 1.5895, "loss/crossentropy": 2.394173502922058, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.22617443650960922, "step": 11936 }, { "epoch": 0.17824532063103354, "grad_norm": 0.345703125, "grad_norm_var": 0.00510400136311849, "learning_rate": 0.0001, "loss": 1.5184, "loss/crossentropy": 2.5945932865142822, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.22541988641023636, "step": 11937 }, { "epoch": 0.1782602528016485, "grad_norm": 0.298828125, "grad_norm_var": 0.005162811279296875, "learning_rate": 0.0001, "loss": 1.3673, "loss/crossentropy": 2.5496408939361572, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18370166420936584, "step": 11938 }, { "epoch": 0.1782751849722635, "grad_norm": 0.40234375, "grad_norm_var": 0.005450884501139323, "learning_rate": 0.0001, "loss": 1.3364, "loss/crossentropy": 2.7948726415634155, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18405449390411377, "step": 11939 }, { "epoch": 0.17829011714287848, "grad_norm": 0.27734375, "grad_norm_var": 0.005533091227213542, "learning_rate": 0.0001, "loss": 1.4154, "loss/crossentropy": 2.6501526832580566, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20444010198116302, "step": 11940 }, { "epoch": 0.17830504931349345, "grad_norm": 0.40234375, "grad_norm_var": 0.005812517801920573, "learning_rate": 0.0001, "loss": 1.6954, "loss/crossentropy": 2.356548547744751, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.28519555926322937, "step": 11941 }, { "epoch": 0.17831998148410844, "grad_norm": 0.30078125, "grad_norm_var": 0.0016468683878580728, "learning_rate": 0.0001, "loss": 1.4199, "loss/crossentropy": 2.5212026834487915, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20117678493261337, "step": 11942 }, { "epoch": 0.17833491365472343, "grad_norm": 0.3359375, "grad_norm_var": 0.00164642333984375, "learning_rate": 0.0001, "loss": 1.4289, "loss/crossentropy": 2.5911965370178223, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20232246071100235, "step": 11943 }, { "epoch": 0.1783498458253384, "grad_norm": 0.3125, "grad_norm_var": 0.0015207767486572266, "learning_rate": 0.0001, "loss": 1.4012, "loss/crossentropy": 2.4520339965820312, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19030483812093735, "step": 11944 }, { "epoch": 0.17836477799595338, "grad_norm": 0.400390625, "grad_norm_var": 0.0017572402954101562, "learning_rate": 0.0001, "loss": 1.6292, "loss/crossentropy": 2.7634241580963135, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.2580774426460266, "step": 11945 }, { "epoch": 0.17837971016656837, "grad_norm": 0.333984375, "grad_norm_var": 0.0016867160797119141, "learning_rate": 0.0001, "loss": 1.4683, "loss/crossentropy": 2.676390767097473, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2261389046907425, "step": 11946 }, { "epoch": 0.17839464233718336, "grad_norm": 0.2890625, "grad_norm_var": 0.001815032958984375, "learning_rate": 0.0001, "loss": 1.2788, "loss/crossentropy": 2.661616086959839, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.16941234469413757, "step": 11947 }, { "epoch": 0.17840957450779832, "grad_norm": 0.298828125, "grad_norm_var": 0.0018698215484619141, "learning_rate": 0.0001, "loss": 1.4593, "loss/crossentropy": 2.661380648612976, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20931077003479004, "step": 11948 }, { "epoch": 0.1784245066784133, "grad_norm": 0.27734375, "grad_norm_var": 0.0018963972727457682, "learning_rate": 0.0001, "loss": 1.3633, "loss/crossentropy": 2.522536039352417, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18752498924732208, "step": 11949 }, { "epoch": 0.1784394388490283, "grad_norm": 0.28515625, "grad_norm_var": 0.001957050959269206, "learning_rate": 0.0001, "loss": 1.3714, "loss/crossentropy": 2.7232524156570435, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18386214971542358, "step": 11950 }, { "epoch": 0.17845437101964326, "grad_norm": 0.341796875, "grad_norm_var": 0.001914072036743164, "learning_rate": 0.0001, "loss": 1.5209, "loss/crossentropy": 2.7721667289733887, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.21623767167329788, "step": 11951 }, { "epoch": 0.17846930319025825, "grad_norm": 0.3203125, "grad_norm_var": 0.0018721898396809897, "learning_rate": 0.0001, "loss": 1.4474, "loss/crossentropy": 2.4930559396743774, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.18963059782981873, "step": 11952 }, { "epoch": 0.17848423536087324, "grad_norm": 0.359375, "grad_norm_var": 0.0019190311431884766, "learning_rate": 0.0001, "loss": 1.6243, "loss/crossentropy": 2.4583784341812134, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.24536722898483276, "step": 11953 }, { "epoch": 0.1784991675314882, "grad_norm": 0.296875, "grad_norm_var": 0.0019266764322916667, "learning_rate": 0.0001, "loss": 1.3325, "loss/crossentropy": 3.022968888282776, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17229586094617844, "step": 11954 }, { "epoch": 0.1785140997021032, "grad_norm": 0.26953125, "grad_norm_var": 0.001697540283203125, "learning_rate": 0.0001, "loss": 1.3169, "loss/crossentropy": 2.497381091117859, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17235103249549866, "step": 11955 }, { "epoch": 0.17852903187271818, "grad_norm": 0.283203125, "grad_norm_var": 0.0016672611236572266, "learning_rate": 0.0001, "loss": 1.3971, "loss/crossentropy": 2.7494808435440063, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19787060469388962, "step": 11956 }, { "epoch": 0.17854396404333317, "grad_norm": 0.33203125, "grad_norm_var": 0.001196908950805664, "learning_rate": 0.0001, "loss": 1.7244, "loss/crossentropy": 2.315539002418518, "loss/fcd": 1.45703125, "loss/idx": 11.0, "loss/logits": 0.26736222207546234, "step": 11957 }, { "epoch": 0.17855889621394813, "grad_norm": 0.443359375, "grad_norm_var": 0.0022005716959635417, "learning_rate": 0.0001, "loss": 1.6958, "loss/crossentropy": 2.7775161266326904, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.2856312766671181, "step": 11958 }, { "epoch": 0.17857382838456312, "grad_norm": 0.3359375, "grad_norm_var": 0.0022005716959635417, "learning_rate": 0.0001, "loss": 1.325, "loss/crossentropy": 2.48858904838562, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.19216693937778473, "step": 11959 }, { "epoch": 0.1785887605551781, "grad_norm": 0.43359375, "grad_norm_var": 0.0029357274373372397, "learning_rate": 0.0001, "loss": 1.6661, "loss/crossentropy": 2.3229414224624634, "loss/fcd": 1.4453125, "loss/idx": 11.0, "loss/logits": 0.2207871377468109, "step": 11960 }, { "epoch": 0.17860369272579307, "grad_norm": 0.30078125, "grad_norm_var": 0.0026382287343343098, "learning_rate": 0.0001, "loss": 1.514, "loss/crossentropy": 2.5598199367523193, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2249189093708992, "step": 11961 }, { "epoch": 0.17861862489640806, "grad_norm": 0.333984375, "grad_norm_var": 0.0026382287343343098, "learning_rate": 0.0001, "loss": 1.4908, "loss/crossentropy": 2.5781644582748413, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.2173624411225319, "step": 11962 }, { "epoch": 0.17863355706702305, "grad_norm": 0.3203125, "grad_norm_var": 0.0025492191314697267, "learning_rate": 0.0001, "loss": 1.6144, "loss/crossentropy": 2.411870002746582, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.24325966835021973, "step": 11963 }, { "epoch": 0.17864848923763804, "grad_norm": 0.2890625, "grad_norm_var": 0.0025918960571289064, "learning_rate": 0.0001, "loss": 1.4373, "loss/crossentropy": 2.529338002204895, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20687483251094818, "step": 11964 }, { "epoch": 0.178663421408253, "grad_norm": 0.306640625, "grad_norm_var": 0.0024538516998291017, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 2.8738235235214233, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.23099465668201447, "step": 11965 }, { "epoch": 0.178678353578868, "grad_norm": 0.2734375, "grad_norm_var": 0.002529764175415039, "learning_rate": 0.0001, "loss": 1.3309, "loss/crossentropy": 2.5622891187667847, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.18249498307704926, "step": 11966 }, { "epoch": 0.17869328574948298, "grad_norm": 0.318359375, "grad_norm_var": 0.0025194644927978515, "learning_rate": 0.0001, "loss": 1.4502, "loss/crossentropy": 2.711945056915283, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21974608302116394, "step": 11967 }, { "epoch": 0.17870821792009794, "grad_norm": 0.3125, "grad_norm_var": 0.002529255549112956, "learning_rate": 0.0001, "loss": 1.4579, "loss/crossentropy": 2.4039881229400635, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2078785076737404, "step": 11968 }, { "epoch": 0.17872315009071293, "grad_norm": 0.353515625, "grad_norm_var": 0.0025049845377604165, "learning_rate": 0.0001, "loss": 1.603, "loss/crossentropy": 2.5181084871292114, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.22413481026887894, "step": 11969 }, { "epoch": 0.17873808226132792, "grad_norm": 0.330078125, "grad_norm_var": 0.0024485111236572264, "learning_rate": 0.0001, "loss": 1.4642, "loss/crossentropy": 2.693565845489502, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20640375465154648, "step": 11970 }, { "epoch": 0.17875301443194289, "grad_norm": 0.302734375, "grad_norm_var": 0.0022617975870768228, "learning_rate": 0.0001, "loss": 1.4145, "loss/crossentropy": 2.5690040588378906, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19961068779230118, "step": 11971 }, { "epoch": 0.17876794660255788, "grad_norm": 0.400390625, "grad_norm_var": 0.0023991266886393228, "learning_rate": 0.0001, "loss": 1.7251, "loss/crossentropy": 2.646093726158142, "loss/fcd": 1.4765625, "loss/idx": 11.0, "loss/logits": 0.24857190251350403, "step": 11972 }, { "epoch": 0.17878287877317287, "grad_norm": 0.37890625, "grad_norm_var": 0.002507464090983073, "learning_rate": 0.0001, "loss": 1.476, "loss/crossentropy": 2.6523369550704956, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21819882839918137, "step": 11973 }, { "epoch": 0.17879781094378785, "grad_norm": 0.36328125, "grad_norm_var": 0.001800394058227539, "learning_rate": 0.0001, "loss": 1.5112, "loss/crossentropy": 2.748814582824707, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2182694748044014, "step": 11974 }, { "epoch": 0.17881274311440282, "grad_norm": 0.310546875, "grad_norm_var": 0.0018361409505208333, "learning_rate": 0.0001, "loss": 1.4032, "loss/crossentropy": 2.5702465772628784, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18051201105117798, "step": 11975 }, { "epoch": 0.1788276752850178, "grad_norm": 0.310546875, "grad_norm_var": 0.00113218625386556, "learning_rate": 0.0001, "loss": 1.4039, "loss/crossentropy": 2.411879539489746, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.18903642892837524, "step": 11976 }, { "epoch": 0.1788426074556328, "grad_norm": 0.287109375, "grad_norm_var": 0.0011885960896809895, "learning_rate": 0.0001, "loss": 1.3862, "loss/crossentropy": 2.556567430496216, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19484268128871918, "step": 11977 }, { "epoch": 0.17885753962624776, "grad_norm": 0.298828125, "grad_norm_var": 0.0012212117513020833, "learning_rate": 0.0001, "loss": 1.382, "loss/crossentropy": 2.605777382850647, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19841410964727402, "step": 11978 }, { "epoch": 0.17887247179686275, "grad_norm": 0.3125, "grad_norm_var": 0.0012270609537760417, "learning_rate": 0.0001, "loss": 1.5166, "loss/crossentropy": 2.221850872039795, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.21190504729747772, "step": 11979 }, { "epoch": 0.17888740396747774, "grad_norm": 0.357421875, "grad_norm_var": 0.0012209415435791016, "learning_rate": 0.0001, "loss": 1.6991, "loss/crossentropy": 2.5929402112960815, "loss/fcd": 1.453125, "loss/idx": 11.0, "loss/logits": 0.24595776200294495, "step": 11980 }, { "epoch": 0.17890233613809273, "grad_norm": 0.33984375, "grad_norm_var": 0.00120391845703125, "learning_rate": 0.0001, "loss": 1.4467, "loss/crossentropy": 2.7083678245544434, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20450693368911743, "step": 11981 }, { "epoch": 0.1789172683087077, "grad_norm": 0.294921875, "grad_norm_var": 0.0010761102040608725, "learning_rate": 0.0001, "loss": 1.3618, "loss/crossentropy": 2.5261815786361694, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1899162381887436, "step": 11982 }, { "epoch": 0.17893220047932268, "grad_norm": 0.29296875, "grad_norm_var": 0.0011540095011393229, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.4815536737442017, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17486751079559326, "step": 11983 }, { "epoch": 0.17894713264993767, "grad_norm": 0.314453125, "grad_norm_var": 0.0011502424875895181, "learning_rate": 0.0001, "loss": 1.4922, "loss/crossentropy": 2.6155468225479126, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22270268946886063, "step": 11984 }, { "epoch": 0.17896206482055263, "grad_norm": 0.306640625, "grad_norm_var": 0.0011281172434488933, "learning_rate": 0.0001, "loss": 1.495, "loss/crossentropy": 2.6477781534194946, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.2137952372431755, "step": 11985 }, { "epoch": 0.17897699699116762, "grad_norm": 0.314453125, "grad_norm_var": 0.001132949193318685, "learning_rate": 0.0001, "loss": 1.4155, "loss/crossentropy": 2.6893768310546875, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20453961938619614, "step": 11986 }, { "epoch": 0.1789919291617826, "grad_norm": 0.279296875, "grad_norm_var": 0.0012340386708577474, "learning_rate": 0.0001, "loss": 1.4012, "loss/crossentropy": 2.5052410364151, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19418759644031525, "step": 11987 }, { "epoch": 0.17900686133239757, "grad_norm": 0.291015625, "grad_norm_var": 0.0008477369944254557, "learning_rate": 0.0001, "loss": 1.331, "loss/crossentropy": 2.5192095041275024, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17085476219654083, "step": 11988 }, { "epoch": 0.17902179350301256, "grad_norm": 0.34375, "grad_norm_var": 0.0006291548411051433, "learning_rate": 0.0001, "loss": 1.4851, "loss/crossentropy": 2.395011305809021, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21165404468774796, "step": 11989 }, { "epoch": 0.17903672567362755, "grad_norm": 0.310546875, "grad_norm_var": 0.00045363108317057293, "learning_rate": 0.0001, "loss": 1.5112, "loss/crossentropy": 2.679155707359314, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.23390211164951324, "step": 11990 }, { "epoch": 0.17905165784424254, "grad_norm": 0.287109375, "grad_norm_var": 0.0004872004191080729, "learning_rate": 0.0001, "loss": 1.4214, "loss/crossentropy": 2.6209720373153687, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19483507424592972, "step": 11991 }, { "epoch": 0.1790665900148575, "grad_norm": 0.310546875, "grad_norm_var": 0.0004872004191080729, "learning_rate": 0.0001, "loss": 1.5241, "loss/crossentropy": 2.4027992486953735, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.2194451540708542, "step": 11992 }, { "epoch": 0.1790815221854725, "grad_norm": 0.28125, "grad_norm_var": 0.0005063215891520182, "learning_rate": 0.0001, "loss": 1.4664, "loss/crossentropy": 2.3921645879745483, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.22425799816846848, "step": 11993 }, { "epoch": 0.17909645435608748, "grad_norm": 0.271484375, "grad_norm_var": 0.0005882104237874349, "learning_rate": 0.0001, "loss": 1.3303, "loss/crossentropy": 2.6678719520568848, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17796553671360016, "step": 11994 }, { "epoch": 0.17911138652670244, "grad_norm": 0.33203125, "grad_norm_var": 0.0006269931793212891, "learning_rate": 0.0001, "loss": 1.4587, "loss/crossentropy": 2.726831555366516, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.22045616805553436, "step": 11995 }, { "epoch": 0.17912631869731743, "grad_norm": 0.283203125, "grad_norm_var": 0.00048203468322753905, "learning_rate": 0.0001, "loss": 1.3655, "loss/crossentropy": 2.702968716621399, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19755978882312775, "step": 11996 }, { "epoch": 0.17914125086793242, "grad_norm": 0.28515625, "grad_norm_var": 0.0004028161366780599, "learning_rate": 0.0001, "loss": 1.4334, "loss/crossentropy": 2.5687698125839233, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20679871737957, "step": 11997 }, { "epoch": 0.1791561830385474, "grad_norm": 0.298828125, "grad_norm_var": 0.00040116310119628904, "learning_rate": 0.0001, "loss": 1.3126, "loss/crossentropy": 2.386738419532776, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.16802439093589783, "step": 11998 }, { "epoch": 0.17917111520916237, "grad_norm": 0.29296875, "grad_norm_var": 0.00040116310119628904, "learning_rate": 0.0001, "loss": 1.4167, "loss/crossentropy": 2.496061325073242, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.197971872985363, "step": 11999 }, { "epoch": 0.17918604737977736, "grad_norm": 0.3125, "grad_norm_var": 0.00039768218994140625, "learning_rate": 0.0001, "loss": 1.4459, "loss/crossentropy": 2.779141068458557, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21540658175945282, "step": 12000 }, { "epoch": 0.17920097955039235, "grad_norm": 0.34765625, "grad_norm_var": 0.0005388736724853515, "learning_rate": 0.0001, "loss": 1.4808, "loss/crossentropy": 2.4793505668640137, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20739079266786575, "step": 12001 }, { "epoch": 0.17921591172100731, "grad_norm": 0.34375, "grad_norm_var": 0.0006387710571289062, "learning_rate": 0.0001, "loss": 1.3805, "loss/crossentropy": 2.722311854362488, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1851513534784317, "step": 12002 }, { "epoch": 0.1792308438916223, "grad_norm": 0.3046875, "grad_norm_var": 0.0005939324696858724, "learning_rate": 0.0001, "loss": 1.4064, "loss/crossentropy": 2.5206172466278076, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19157841056585312, "step": 12003 }, { "epoch": 0.1792457760622373, "grad_norm": 0.298828125, "grad_norm_var": 0.0005821069081624349, "learning_rate": 0.0001, "loss": 1.2554, "loss/crossentropy": 2.581490397453308, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.1577470898628235, "step": 12004 }, { "epoch": 0.17926070823285226, "grad_norm": 0.322265625, "grad_norm_var": 0.000504302978515625, "learning_rate": 0.0001, "loss": 1.4293, "loss/crossentropy": 2.641525626182556, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.1910627782344818, "step": 12005 }, { "epoch": 0.17927564040346725, "grad_norm": 0.296875, "grad_norm_var": 0.0005061944325764974, "learning_rate": 0.0001, "loss": 1.3661, "loss/crossentropy": 2.729470372200012, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18246746063232422, "step": 12006 }, { "epoch": 0.17929057257408224, "grad_norm": 0.341796875, "grad_norm_var": 0.0005676110585530598, "learning_rate": 0.0001, "loss": 1.5283, "loss/crossentropy": 2.5995575189590454, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.223636195063591, "step": 12007 }, { "epoch": 0.17930550474469722, "grad_norm": 0.30859375, "grad_norm_var": 0.0005671183268229166, "learning_rate": 0.0001, "loss": 1.5148, "loss/crossentropy": 2.3191981315612793, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.20619328320026398, "step": 12008 }, { "epoch": 0.1793204369153122, "grad_norm": 0.31640625, "grad_norm_var": 0.000520769755045573, "learning_rate": 0.0001, "loss": 1.4931, "loss/crossentropy": 2.635357618331909, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22361813485622406, "step": 12009 }, { "epoch": 0.17933536908592718, "grad_norm": 0.30859375, "grad_norm_var": 0.00041718482971191405, "learning_rate": 0.0001, "loss": 1.331, "loss/crossentropy": 2.702152371406555, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.178685262799263, "step": 12010 }, { "epoch": 0.17935030125654217, "grad_norm": 0.287109375, "grad_norm_var": 0.0004241307576497396, "learning_rate": 0.0001, "loss": 1.4599, "loss/crossentropy": 2.6760571002960205, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21772851794958115, "step": 12011 }, { "epoch": 0.17936523342715713, "grad_norm": 0.294921875, "grad_norm_var": 0.00039189656575520834, "learning_rate": 0.0001, "loss": 1.4221, "loss/crossentropy": 2.626939058303833, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19945010542869568, "step": 12012 }, { "epoch": 0.17938016559777212, "grad_norm": 0.322265625, "grad_norm_var": 0.0003547509511311849, "learning_rate": 0.0001, "loss": 1.6408, "loss/crossentropy": 2.6571593284606934, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.2697339653968811, "step": 12013 }, { "epoch": 0.1793950977683871, "grad_norm": 0.294921875, "grad_norm_var": 0.0003627618153889974, "learning_rate": 0.0001, "loss": 1.5094, "loss/crossentropy": 2.5112316608428955, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.20863117277622223, "step": 12014 }, { "epoch": 0.17941002993900207, "grad_norm": 0.291015625, "grad_norm_var": 0.0003679911295572917, "learning_rate": 0.0001, "loss": 1.3498, "loss/crossentropy": 2.490173101425171, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18961767107248306, "step": 12015 }, { "epoch": 0.17942496210961706, "grad_norm": 0.283203125, "grad_norm_var": 0.00041972796122233074, "learning_rate": 0.0001, "loss": 1.3211, "loss/crossentropy": 2.4514178037643433, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17262135446071625, "step": 12016 }, { "epoch": 0.17943989428023205, "grad_norm": 0.3359375, "grad_norm_var": 0.0003697554270426432, "learning_rate": 0.0001, "loss": 1.4978, "loss/crossentropy": 2.4738121032714844, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2086886167526245, "step": 12017 }, { "epoch": 0.17945482645084704, "grad_norm": 0.298828125, "grad_norm_var": 0.00029042561848958336, "learning_rate": 0.0001, "loss": 1.3654, "loss/crossentropy": 2.508456826210022, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.181756429374218, "step": 12018 }, { "epoch": 0.179469758621462, "grad_norm": 0.27734375, "grad_norm_var": 0.00034427642822265625, "learning_rate": 0.0001, "loss": 1.3162, "loss/crossentropy": 2.4228533506393433, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17558610439300537, "step": 12019 }, { "epoch": 0.179484690792077, "grad_norm": 0.30078125, "grad_norm_var": 0.0003429253896077474, "learning_rate": 0.0001, "loss": 1.2919, "loss/crossentropy": 2.7375375032424927, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.17858754843473434, "step": 12020 }, { "epoch": 0.17949962296269198, "grad_norm": 0.30859375, "grad_norm_var": 0.00032323201497395834, "learning_rate": 0.0001, "loss": 1.4449, "loss/crossentropy": 2.582523465156555, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21441400051116943, "step": 12021 }, { "epoch": 0.17951455513330694, "grad_norm": 0.275390625, "grad_norm_var": 0.0003730614980061849, "learning_rate": 0.0001, "loss": 1.4111, "loss/crossentropy": 2.5212345123291016, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20794226229190826, "step": 12022 }, { "epoch": 0.17952948730392193, "grad_norm": 0.27734375, "grad_norm_var": 0.00029805501302083336, "learning_rate": 0.0001, "loss": 1.4219, "loss/crossentropy": 2.6328078508377075, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2031298279762268, "step": 12023 }, { "epoch": 0.17954441947453692, "grad_norm": 0.3203125, "grad_norm_var": 0.0003218968709309896, "learning_rate": 0.0001, "loss": 1.3611, "loss/crossentropy": 2.8671151399612427, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1892155259847641, "step": 12024 }, { "epoch": 0.1795593516451519, "grad_norm": 0.283203125, "grad_norm_var": 0.0003162225087483724, "learning_rate": 0.0001, "loss": 1.358, "loss/crossentropy": 2.6377761363983154, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1861024647951126, "step": 12025 }, { "epoch": 0.17957428381576687, "grad_norm": 0.306640625, "grad_norm_var": 0.000313568115234375, "learning_rate": 0.0001, "loss": 1.5513, "loss/crossentropy": 2.6711848974227905, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.24271195381879807, "step": 12026 }, { "epoch": 0.17958921598638186, "grad_norm": 0.259765625, "grad_norm_var": 0.00039768218994140625, "learning_rate": 0.0001, "loss": 1.3016, "loss/crossentropy": 2.669067621231079, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1766097992658615, "step": 12027 }, { "epoch": 0.17960414815699685, "grad_norm": 0.318359375, "grad_norm_var": 0.00042972564697265627, "learning_rate": 0.0001, "loss": 1.4752, "loss/crossentropy": 2.6503862142562866, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.2173846885561943, "step": 12028 }, { "epoch": 0.1796190803276118, "grad_norm": 0.33203125, "grad_norm_var": 0.0004684289296468099, "learning_rate": 0.0001, "loss": 1.5964, "loss/crossentropy": 2.520563006401062, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.23310458660125732, "step": 12029 }, { "epoch": 0.1796340124982268, "grad_norm": 0.3203125, "grad_norm_var": 0.0004992167154947917, "learning_rate": 0.0001, "loss": 1.5071, "loss/crossentropy": 2.5236958265304565, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.21413088589906693, "step": 12030 }, { "epoch": 0.1796489446688418, "grad_norm": 0.337890625, "grad_norm_var": 0.0005846659342447917, "learning_rate": 0.0001, "loss": 1.5178, "loss/crossentropy": 2.79098117351532, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.21697966009378433, "step": 12031 }, { "epoch": 0.17966387683945675, "grad_norm": 0.30859375, "grad_norm_var": 0.0005604902903238933, "learning_rate": 0.0001, "loss": 1.4243, "loss/crossentropy": 2.6444296836853027, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.21338366717100143, "step": 12032 }, { "epoch": 0.17967880901007174, "grad_norm": 0.373046875, "grad_norm_var": 0.0008054097493489583, "learning_rate": 0.0001, "loss": 1.5781, "loss/crossentropy": 2.573426127433777, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.22655464708805084, "step": 12033 }, { "epoch": 0.17969374118068673, "grad_norm": 0.30078125, "grad_norm_var": 0.0008037408192952474, "learning_rate": 0.0001, "loss": 1.4432, "loss/crossentropy": 2.4408267736434937, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20883305370807648, "step": 12034 }, { "epoch": 0.17970867335130172, "grad_norm": 0.28515625, "grad_norm_var": 0.0007774194081624348, "learning_rate": 0.0001, "loss": 1.3585, "loss/crossentropy": 2.581026315689087, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18665944039821625, "step": 12035 }, { "epoch": 0.17972360552191669, "grad_norm": 0.34765625, "grad_norm_var": 0.0008773644765218099, "learning_rate": 0.0001, "loss": 1.5776, "loss/crossentropy": 2.284131169319153, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.2299206703901291, "step": 12036 }, { "epoch": 0.17973853769253167, "grad_norm": 0.26171875, "grad_norm_var": 0.001021560033162435, "learning_rate": 0.0001, "loss": 1.3032, "loss/crossentropy": 2.5244204998016357, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17035988718271255, "step": 12037 }, { "epoch": 0.17975346986314666, "grad_norm": 0.263671875, "grad_norm_var": 0.0010791619618733723, "learning_rate": 0.0001, "loss": 1.3402, "loss/crossentropy": 2.5851024389266968, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1800358071923256, "step": 12038 }, { "epoch": 0.17976840203376163, "grad_norm": 0.3515625, "grad_norm_var": 0.0011395613352457683, "learning_rate": 0.0001, "loss": 1.3969, "loss/crossentropy": 2.810880422592163, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19378401339054108, "step": 12039 }, { "epoch": 0.17978333420437662, "grad_norm": 0.279296875, "grad_norm_var": 0.0011919657389322916, "learning_rate": 0.0001, "loss": 1.2704, "loss/crossentropy": 2.6603128910064697, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.16105790436267853, "step": 12040 }, { "epoch": 0.1797982663749916, "grad_norm": 0.2451171875, "grad_norm_var": 0.0014090816179911296, "learning_rate": 0.0001, "loss": 1.3839, "loss/crossentropy": 2.420579433441162, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18861666321754456, "step": 12041 }, { "epoch": 0.1798131985456066, "grad_norm": 0.283203125, "grad_norm_var": 0.0014405528704325358, "learning_rate": 0.0001, "loss": 1.286, "loss/crossentropy": 2.604268789291382, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.16886019706726074, "step": 12042 }, { "epoch": 0.17982813071622156, "grad_norm": 0.263671875, "grad_norm_var": 0.0014183322588602703, "learning_rate": 0.0001, "loss": 1.379, "loss/crossentropy": 2.534915566444397, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19545171409845352, "step": 12043 }, { "epoch": 0.17984306288683655, "grad_norm": 0.318359375, "grad_norm_var": 0.0014183322588602703, "learning_rate": 0.0001, "loss": 1.4916, "loss/crossentropy": 2.59607195854187, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22211259603500366, "step": 12044 }, { "epoch": 0.17985799505745154, "grad_norm": 0.28515625, "grad_norm_var": 0.0013836185137430827, "learning_rate": 0.0001, "loss": 1.4378, "loss/crossentropy": 2.5447516441345215, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.1995256319642067, "step": 12045 }, { "epoch": 0.1798729272280665, "grad_norm": 0.30859375, "grad_norm_var": 0.001362923781077067, "learning_rate": 0.0001, "loss": 1.4479, "loss/crossentropy": 2.5785804986953735, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2018047571182251, "step": 12046 }, { "epoch": 0.1798878593986815, "grad_norm": 0.298828125, "grad_norm_var": 0.0012653311093648275, "learning_rate": 0.0001, "loss": 1.5152, "loss/crossentropy": 2.5218734741210938, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.24562468379735947, "step": 12047 }, { "epoch": 0.17990279156929648, "grad_norm": 0.2734375, "grad_norm_var": 0.0012947996457417807, "learning_rate": 0.0001, "loss": 1.4406, "loss/crossentropy": 2.365661144256592, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19058869779109955, "step": 12048 }, { "epoch": 0.17991772373991144, "grad_norm": 0.271484375, "grad_norm_var": 0.000898897647857666, "learning_rate": 0.0001, "loss": 1.3873, "loss/crossentropy": 2.721841335296631, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19195310026407242, "step": 12049 }, { "epoch": 0.17993265591052643, "grad_norm": 0.318359375, "grad_norm_var": 0.0009438157081604004, "learning_rate": 0.0001, "loss": 1.4245, "loss/crossentropy": 2.6344668865203857, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20185602456331253, "step": 12050 }, { "epoch": 0.17994758808114142, "grad_norm": 0.330078125, "grad_norm_var": 0.0010352094968159994, "learning_rate": 0.0001, "loss": 1.3428, "loss/crossentropy": 2.623986005783081, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18659768998622894, "step": 12051 }, { "epoch": 0.1799625202517564, "grad_norm": 0.37109375, "grad_norm_var": 0.001237960656483968, "learning_rate": 0.0001, "loss": 1.5374, "loss/crossentropy": 2.653836965560913, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.22492137551307678, "step": 12052 }, { "epoch": 0.17997745242237137, "grad_norm": 0.47265625, "grad_norm_var": 0.003076454003651937, "learning_rate": 0.0001, "loss": 1.9377, "loss/crossentropy": 2.3974400758743286, "loss/fcd": 1.5859375, "loss/idx": 11.0, "loss/logits": 0.3517506420612335, "step": 12053 }, { "epoch": 0.17999238459298636, "grad_norm": 0.328125, "grad_norm_var": 0.0029516180356343588, "learning_rate": 0.0001, "loss": 1.5467, "loss/crossentropy": 2.667269468307495, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.22633950412273407, "step": 12054 }, { "epoch": 0.18000731676360135, "grad_norm": 0.314453125, "grad_norm_var": 0.002844107151031494, "learning_rate": 0.0001, "loss": 1.4091, "loss/crossentropy": 2.3518396615982056, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.1747216358780861, "step": 12055 }, { "epoch": 0.1800222489342163, "grad_norm": 0.326171875, "grad_norm_var": 0.0027887940406799315, "learning_rate": 0.0001, "loss": 1.6758, "loss/crossentropy": 2.604796886444092, "loss/fcd": 1.421875, "loss/idx": 11.0, "loss/logits": 0.25388824194669724, "step": 12056 }, { "epoch": 0.1800371811048313, "grad_norm": 0.349609375, "grad_norm_var": 0.0025247573852539063, "learning_rate": 0.0001, "loss": 1.4995, "loss/crossentropy": 2.449348568916321, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.18311776220798492, "step": 12057 }, { "epoch": 0.1800521132754463, "grad_norm": 0.330078125, "grad_norm_var": 0.0024347305297851562, "learning_rate": 0.0001, "loss": 1.6076, "loss/crossentropy": 2.649956226348877, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.2521290183067322, "step": 12058 }, { "epoch": 0.18006704544606128, "grad_norm": 0.296875, "grad_norm_var": 0.002243153254191081, "learning_rate": 0.0001, "loss": 1.4327, "loss/crossentropy": 2.7300411462783813, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.21396075934171677, "step": 12059 }, { "epoch": 0.18008197761667624, "grad_norm": 0.291015625, "grad_norm_var": 0.0023125807444254556, "learning_rate": 0.0001, "loss": 1.3937, "loss/crossentropy": 2.5488678216934204, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19445043057203293, "step": 12060 }, { "epoch": 0.18009690978729123, "grad_norm": 0.27734375, "grad_norm_var": 0.0023556868235270183, "learning_rate": 0.0001, "loss": 1.3841, "loss/crossentropy": 2.601121187210083, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.20438174158334732, "step": 12061 }, { "epoch": 0.18011184195790622, "grad_norm": 0.322265625, "grad_norm_var": 0.00234222412109375, "learning_rate": 0.0001, "loss": 1.4138, "loss/crossentropy": 2.5512311458587646, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.18726776540279388, "step": 12062 }, { "epoch": 0.18012677412852118, "grad_norm": 0.287109375, "grad_norm_var": 0.0023889541625976562, "learning_rate": 0.0001, "loss": 1.4543, "loss/crossentropy": 2.356381058692932, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20431461185216904, "step": 12063 }, { "epoch": 0.18014170629913617, "grad_norm": 0.328125, "grad_norm_var": 0.0022180557250976564, "learning_rate": 0.0001, "loss": 1.6118, "loss/crossentropy": 2.783458709716797, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.25246090441942215, "step": 12064 }, { "epoch": 0.18015663846975116, "grad_norm": 0.294921875, "grad_norm_var": 0.0020822525024414063, "learning_rate": 0.0001, "loss": 1.2924, "loss/crossentropy": 2.562338709831238, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.16739293187856674, "step": 12065 }, { "epoch": 0.18017157064036612, "grad_norm": 0.28125, "grad_norm_var": 0.0022130171457926434, "learning_rate": 0.0001, "loss": 1.4122, "loss/crossentropy": 2.580959439277649, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19347292929887772, "step": 12066 }, { "epoch": 0.18018650281098111, "grad_norm": 0.267578125, "grad_norm_var": 0.00241545041402181, "learning_rate": 0.0001, "loss": 1.3964, "loss/crossentropy": 2.532949447631836, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.20104877650737762, "step": 12067 }, { "epoch": 0.1802014349815961, "grad_norm": 0.298828125, "grad_norm_var": 0.002260780334472656, "learning_rate": 0.0001, "loss": 1.4088, "loss/crossentropy": 2.6691583395004272, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20173272490501404, "step": 12068 }, { "epoch": 0.1802163671522111, "grad_norm": 0.337890625, "grad_norm_var": 0.0005926609039306641, "learning_rate": 0.0001, "loss": 1.5294, "loss/crossentropy": 2.5341379642486572, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.20520970225334167, "step": 12069 }, { "epoch": 0.18023129932282606, "grad_norm": 0.30078125, "grad_norm_var": 0.0005668481190999349, "learning_rate": 0.0001, "loss": 1.4053, "loss/crossentropy": 2.910756826400757, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20609359443187714, "step": 12070 }, { "epoch": 0.18024623149344104, "grad_norm": 0.357421875, "grad_norm_var": 0.000727701187133789, "learning_rate": 0.0001, "loss": 1.3937, "loss/crossentropy": 2.667574882507324, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19445863366127014, "step": 12071 }, { "epoch": 0.18026116366405603, "grad_norm": 0.287109375, "grad_norm_var": 0.0007346947987874349, "learning_rate": 0.0001, "loss": 1.4701, "loss/crossentropy": 2.6017807722091675, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.21623952686786652, "step": 12072 }, { "epoch": 0.180276095834671, "grad_norm": 0.2734375, "grad_norm_var": 0.0006621678670247396, "learning_rate": 0.0001, "loss": 1.37, "loss/crossentropy": 2.448145270347595, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19026409834623337, "step": 12073 }, { "epoch": 0.18029102800528599, "grad_norm": 0.310546875, "grad_norm_var": 0.0006128946940104166, "learning_rate": 0.0001, "loss": 1.4111, "loss/crossentropy": 2.744900107383728, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20020771026611328, "step": 12074 }, { "epoch": 0.18030596017590098, "grad_norm": 0.359375, "grad_norm_var": 0.0008244832356770833, "learning_rate": 0.0001, "loss": 1.5073, "loss/crossentropy": 2.6244006156921387, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.1987333819270134, "step": 12075 }, { "epoch": 0.18032089234651594, "grad_norm": 0.392578125, "grad_norm_var": 0.001284027099609375, "learning_rate": 0.0001, "loss": 1.6774, "loss/crossentropy": 2.432270884513855, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.279001384973526, "step": 12076 }, { "epoch": 0.18033582451713093, "grad_norm": 0.3203125, "grad_norm_var": 0.0012063980102539062, "learning_rate": 0.0001, "loss": 1.4575, "loss/crossentropy": 2.6409618854522705, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20359650999307632, "step": 12077 }, { "epoch": 0.18035075668774592, "grad_norm": 0.30859375, "grad_norm_var": 0.0012025038401285808, "learning_rate": 0.0001, "loss": 1.3532, "loss/crossentropy": 2.368492364883423, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.16572914272546768, "step": 12078 }, { "epoch": 0.1803656888583609, "grad_norm": 0.31640625, "grad_norm_var": 0.001155535380045573, "learning_rate": 0.0001, "loss": 1.5116, "loss/crossentropy": 2.6679911613464355, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.2147480696439743, "step": 12079 }, { "epoch": 0.18038062102897587, "grad_norm": 0.306640625, "grad_norm_var": 0.0011459191640218098, "learning_rate": 0.0001, "loss": 1.2791, "loss/crossentropy": 2.634997248649597, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.16187996417284012, "step": 12080 }, { "epoch": 0.18039555319959086, "grad_norm": 0.294921875, "grad_norm_var": 0.0011459191640218098, "learning_rate": 0.0001, "loss": 1.3453, "loss/crossentropy": 2.613271951675415, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1734388917684555, "step": 12081 }, { "epoch": 0.18041048537020585, "grad_norm": 0.310546875, "grad_norm_var": 0.0010741551717122395, "learning_rate": 0.0001, "loss": 1.3805, "loss/crossentropy": 2.7227118015289307, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19302423298358917, "step": 12082 }, { "epoch": 0.1804254175408208, "grad_norm": 0.29296875, "grad_norm_var": 0.0009532769521077474, "learning_rate": 0.0001, "loss": 1.3876, "loss/crossentropy": 2.718486189842224, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1962105631828308, "step": 12083 }, { "epoch": 0.1804403497114358, "grad_norm": 0.3046875, "grad_norm_var": 0.0009414037068684896, "learning_rate": 0.0001, "loss": 1.4119, "loss/crossentropy": 2.592361807823181, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.197029247879982, "step": 12084 }, { "epoch": 0.1804552818820508, "grad_norm": 0.34375, "grad_norm_var": 0.0009597619374593099, "learning_rate": 0.0001, "loss": 1.4553, "loss/crossentropy": 2.6991714239120483, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20140333473682404, "step": 12085 }, { "epoch": 0.18047021405266578, "grad_norm": 0.29296875, "grad_norm_var": 0.0009809970855712891, "learning_rate": 0.0001, "loss": 1.417, "loss/crossentropy": 2.665563702583313, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.2021094709634781, "step": 12086 }, { "epoch": 0.18048514622328074, "grad_norm": 0.263671875, "grad_norm_var": 0.001025247573852539, "learning_rate": 0.0001, "loss": 1.3703, "loss/crossentropy": 2.771255373954773, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.20620155334472656, "step": 12087 }, { "epoch": 0.18050007839389573, "grad_norm": 0.283203125, "grad_norm_var": 0.0010387261708577475, "learning_rate": 0.0001, "loss": 1.3285, "loss/crossentropy": 2.727379083633423, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1800658255815506, "step": 12088 }, { "epoch": 0.18051501056451072, "grad_norm": 0.306640625, "grad_norm_var": 0.0009417215983072917, "learning_rate": 0.0001, "loss": 1.5101, "loss/crossentropy": 2.7038530111312866, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2210131287574768, "step": 12089 }, { "epoch": 0.18052994273512568, "grad_norm": 0.34765625, "grad_norm_var": 0.0010157108306884765, "learning_rate": 0.0001, "loss": 1.4948, "loss/crossentropy": 2.615886092185974, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.2291921079158783, "step": 12090 }, { "epoch": 0.18054487490574067, "grad_norm": 0.33203125, "grad_norm_var": 0.0009017785390218099, "learning_rate": 0.0001, "loss": 1.408, "loss/crossentropy": 2.6552551984786987, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20096101611852646, "step": 12091 }, { "epoch": 0.18055980707635566, "grad_norm": 0.3359375, "grad_norm_var": 0.000505828857421875, "learning_rate": 0.0001, "loss": 1.5503, "loss/crossentropy": 2.624066472053528, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.2338847517967224, "step": 12092 }, { "epoch": 0.18057473924697062, "grad_norm": 0.26953125, "grad_norm_var": 0.0005975723266601563, "learning_rate": 0.0001, "loss": 1.3838, "loss/crossentropy": 2.7451040744781494, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1885235607624054, "step": 12093 }, { "epoch": 0.1805896714175856, "grad_norm": 0.28125, "grad_norm_var": 0.0006380716959635417, "learning_rate": 0.0001, "loss": 1.3635, "loss/crossentropy": 2.635090470314026, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.17595291137695312, "step": 12094 }, { "epoch": 0.1806046035882006, "grad_norm": 0.2890625, "grad_norm_var": 0.0006438573201497396, "learning_rate": 0.0001, "loss": 1.3599, "loss/crossentropy": 2.6009644269943237, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18416665494441986, "step": 12095 }, { "epoch": 0.1806195357588156, "grad_norm": 0.3046875, "grad_norm_var": 0.0006432692209879557, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 2.662035584449768, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.22714471817016602, "step": 12096 }, { "epoch": 0.18063446792943055, "grad_norm": 0.2578125, "grad_norm_var": 0.0007710138956705729, "learning_rate": 0.0001, "loss": 1.2647, "loss/crossentropy": 2.514790177345276, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.15918170660734177, "step": 12097 }, { "epoch": 0.18064940010004554, "grad_norm": 0.306640625, "grad_norm_var": 0.0007670084635416667, "learning_rate": 0.0001, "loss": 1.4798, "loss/crossentropy": 2.470771908760071, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.22984831035137177, "step": 12098 }, { "epoch": 0.18066433227066053, "grad_norm": 0.3125, "grad_norm_var": 0.0007705052693684895, "learning_rate": 0.0001, "loss": 1.3538, "loss/crossentropy": 2.558596611022949, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17409836500883102, "step": 12099 }, { "epoch": 0.1806792644412755, "grad_norm": 0.330078125, "grad_norm_var": 0.0008198897043863933, "learning_rate": 0.0001, "loss": 1.4159, "loss/crossentropy": 2.4383978843688965, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19319384545087814, "step": 12100 }, { "epoch": 0.18069419661189048, "grad_norm": 0.28515625, "grad_norm_var": 0.0007207075754801433, "learning_rate": 0.0001, "loss": 1.4251, "loss/crossentropy": 2.5062676668167114, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1985037922859192, "step": 12101 }, { "epoch": 0.18070912878250547, "grad_norm": 0.32421875, "grad_norm_var": 0.0007527510325113933, "learning_rate": 0.0001, "loss": 1.4602, "loss/crossentropy": 2.5747017860412598, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.21415270864963531, "step": 12102 }, { "epoch": 0.18072406095312046, "grad_norm": 0.275390625, "grad_norm_var": 0.0007016340891520183, "learning_rate": 0.0001, "loss": 1.3821, "loss/crossentropy": 2.6143343448638916, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19462217390537262, "step": 12103 }, { "epoch": 0.18073899312373543, "grad_norm": 0.31640625, "grad_norm_var": 0.0006846110026041667, "learning_rate": 0.0001, "loss": 1.4487, "loss/crossentropy": 2.4499597549438477, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20652519911527634, "step": 12104 }, { "epoch": 0.18075392529435041, "grad_norm": 0.3359375, "grad_norm_var": 0.0007458845774332682, "learning_rate": 0.0001, "loss": 1.5961, "loss/crossentropy": 2.516512632369995, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.2288791760802269, "step": 12105 }, { "epoch": 0.1807688574649654, "grad_norm": 0.482421875, "grad_norm_var": 0.0026201883951822917, "learning_rate": 0.0001, "loss": 1.7206, "loss/crossentropy": 2.437564969062805, "loss/fcd": 1.453125, "loss/idx": 11.0, "loss/logits": 0.26746220886707306, "step": 12106 }, { "epoch": 0.18078378963558037, "grad_norm": 0.341796875, "grad_norm_var": 0.0026484012603759767, "learning_rate": 0.0001, "loss": 1.5224, "loss/crossentropy": 2.72077476978302, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.22944432497024536, "step": 12107 }, { "epoch": 0.18079872180619536, "grad_norm": 0.302734375, "grad_norm_var": 0.0026270548502604165, "learning_rate": 0.0001, "loss": 1.3172, "loss/crossentropy": 2.833166003227234, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1804768219590187, "step": 12108 }, { "epoch": 0.18081365397681035, "grad_norm": 0.279296875, "grad_norm_var": 0.0025757948557535806, "learning_rate": 0.0001, "loss": 1.3684, "loss/crossentropy": 2.6136633157730103, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1808985471725464, "step": 12109 }, { "epoch": 0.1808285861474253, "grad_norm": 0.287109375, "grad_norm_var": 0.002552286783854167, "learning_rate": 0.0001, "loss": 1.454, "loss/crossentropy": 2.677868962287903, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2118133082985878, "step": 12110 }, { "epoch": 0.1808435183180403, "grad_norm": 0.283203125, "grad_norm_var": 0.0025742689768473307, "learning_rate": 0.0001, "loss": 1.3759, "loss/crossentropy": 2.6497349739074707, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18447773158550262, "step": 12111 }, { "epoch": 0.1808584504886553, "grad_norm": 0.326171875, "grad_norm_var": 0.0025761922200520835, "learning_rate": 0.0001, "loss": 1.4382, "loss/crossentropy": 2.5433177947998047, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20387482643127441, "step": 12112 }, { "epoch": 0.18087338265927028, "grad_norm": 0.251953125, "grad_norm_var": 0.0026233514149983724, "learning_rate": 0.0001, "loss": 1.3646, "loss/crossentropy": 2.44916033744812, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1849336475133896, "step": 12113 }, { "epoch": 0.18088831482988524, "grad_norm": 0.80078125, "grad_norm_var": 0.01732934315999349, "learning_rate": 0.0001, "loss": 1.4314, "loss/crossentropy": 2.318395733833313, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.15410130470991135, "step": 12114 }, { "epoch": 0.18090324700050023, "grad_norm": 0.37890625, "grad_norm_var": 0.017308807373046874, "learning_rate": 0.0001, "loss": 1.6238, "loss/crossentropy": 2.554311752319336, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.2527491897344589, "step": 12115 }, { "epoch": 0.18091817917111522, "grad_norm": 0.35546875, "grad_norm_var": 0.017281325658162434, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.9217723608016968, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.23174965381622314, "step": 12116 }, { "epoch": 0.18093311134173018, "grad_norm": 0.2734375, "grad_norm_var": 0.017393859227498372, "learning_rate": 0.0001, "loss": 1.3194, "loss/crossentropy": 2.529698133468628, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.16709107160568237, "step": 12117 }, { "epoch": 0.18094804351234517, "grad_norm": 0.298828125, "grad_norm_var": 0.01752465565999349, "learning_rate": 0.0001, "loss": 1.3692, "loss/crossentropy": 2.546529769897461, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18563692271709442, "step": 12118 }, { "epoch": 0.18096297568296016, "grad_norm": 0.3046875, "grad_norm_var": 0.01728933652242025, "learning_rate": 0.0001, "loss": 1.4119, "loss/crossentropy": 2.5183253288269043, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20490190386772156, "step": 12119 }, { "epoch": 0.18097790785357515, "grad_norm": 0.3046875, "grad_norm_var": 0.01735227902730306, "learning_rate": 0.0001, "loss": 1.4057, "loss/crossentropy": 2.4910587072372437, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1947886049747467, "step": 12120 }, { "epoch": 0.1809928400241901, "grad_norm": 0.3359375, "grad_norm_var": 0.01735227902730306, "learning_rate": 0.0001, "loss": 1.4405, "loss/crossentropy": 2.40767240524292, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19046467542648315, "step": 12121 }, { "epoch": 0.1810077721948051, "grad_norm": 0.26953125, "grad_norm_var": 0.016439247131347656, "learning_rate": 0.0001, "loss": 1.3252, "loss/crossentropy": 2.5209269523620605, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18070144951343536, "step": 12122 }, { "epoch": 0.1810227043654201, "grad_norm": 0.306640625, "grad_norm_var": 0.0164947509765625, "learning_rate": 0.0001, "loss": 1.4146, "loss/crossentropy": 2.7054537534713745, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20368129014968872, "step": 12123 }, { "epoch": 0.18103763653603505, "grad_norm": 0.31640625, "grad_norm_var": 0.016447687149047853, "learning_rate": 0.0001, "loss": 1.4287, "loss/crossentropy": 2.442170739173889, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.1903904750943184, "step": 12124 }, { "epoch": 0.18105256870665004, "grad_norm": 0.326171875, "grad_norm_var": 0.016231775283813477, "learning_rate": 0.0001, "loss": 1.459, "loss/crossentropy": 2.7108196020126343, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.2207091599702835, "step": 12125 }, { "epoch": 0.18106750087726503, "grad_norm": 0.328125, "grad_norm_var": 0.016054534912109376, "learning_rate": 0.0001, "loss": 1.4304, "loss/crossentropy": 2.8129124641418457, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.1999099999666214, "step": 12126 }, { "epoch": 0.18108243304788, "grad_norm": 0.2890625, "grad_norm_var": 0.01601128578186035, "learning_rate": 0.0001, "loss": 1.4152, "loss/crossentropy": 2.430072844028473, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.2237728014588356, "step": 12127 }, { "epoch": 0.18109736521849498, "grad_norm": 0.3125, "grad_norm_var": 0.016051228841145834, "learning_rate": 0.0001, "loss": 1.4822, "loss/crossentropy": 2.607680916786194, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.21264079958200455, "step": 12128 }, { "epoch": 0.18111229738910997, "grad_norm": 0.31640625, "grad_norm_var": 0.01554716428120931, "learning_rate": 0.0001, "loss": 1.397, "loss/crossentropy": 2.781680464744568, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19382943958044052, "step": 12129 }, { "epoch": 0.18112722955972496, "grad_norm": 0.310546875, "grad_norm_var": 0.0007659276326497396, "learning_rate": 0.0001, "loss": 1.4644, "loss/crossentropy": 2.7455610036849976, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.21052324771881104, "step": 12130 }, { "epoch": 0.18114216173033992, "grad_norm": 0.283203125, "grad_norm_var": 0.0005128065745035808, "learning_rate": 0.0001, "loss": 1.425, "loss/crossentropy": 2.3324615955352783, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.21019139140844345, "step": 12131 }, { "epoch": 0.1811570939009549, "grad_norm": 0.298828125, "grad_norm_var": 0.0003565470377604167, "learning_rate": 0.0001, "loss": 1.5047, "loss/crossentropy": 2.30527126789093, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.22738133370876312, "step": 12132 }, { "epoch": 0.1811720260715699, "grad_norm": 0.330078125, "grad_norm_var": 0.00032105445861816404, "learning_rate": 0.0001, "loss": 1.5037, "loss/crossentropy": 2.2889111042022705, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.20686814188957214, "step": 12133 }, { "epoch": 0.18118695824218486, "grad_norm": 0.3046875, "grad_norm_var": 0.00031585693359375, "learning_rate": 0.0001, "loss": 1.4063, "loss/crossentropy": 2.6195831298828125, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19142884016036987, "step": 12134 }, { "epoch": 0.18120189041279985, "grad_norm": 0.353515625, "grad_norm_var": 0.0004394372304280599, "learning_rate": 0.0001, "loss": 1.6736, "loss/crossentropy": 2.4689719676971436, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.26347725093364716, "step": 12135 }, { "epoch": 0.18121682258341484, "grad_norm": 0.296875, "grad_norm_var": 0.0004504998524983724, "learning_rate": 0.0001, "loss": 1.3535, "loss/crossentropy": 2.55341374874115, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18551279604434967, "step": 12136 }, { "epoch": 0.1812317547540298, "grad_norm": 0.322265625, "grad_norm_var": 0.0004170099894205729, "learning_rate": 0.0001, "loss": 1.5619, "loss/crossentropy": 2.3167017698287964, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.23772753775119781, "step": 12137 }, { "epoch": 0.1812466869246448, "grad_norm": 0.33984375, "grad_norm_var": 0.0003437678019205729, "learning_rate": 0.0001, "loss": 1.505, "loss/crossentropy": 2.566375494003296, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.21207533031702042, "step": 12138 }, { "epoch": 0.18126161909525978, "grad_norm": 0.3359375, "grad_norm_var": 0.0003659407297770182, "learning_rate": 0.0001, "loss": 1.4543, "loss/crossentropy": 2.5568405389785767, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19647374004125595, "step": 12139 }, { "epoch": 0.18127655126587477, "grad_norm": 0.384765625, "grad_norm_var": 0.000656890869140625, "learning_rate": 0.0001, "loss": 1.519, "loss/crossentropy": 2.6647955179214478, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.22216297686100006, "step": 12140 }, { "epoch": 0.18129148343648974, "grad_norm": 0.328125, "grad_norm_var": 0.0006585280100504557, "learning_rate": 0.0001, "loss": 1.4078, "loss/crossentropy": 2.6065629720687866, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20078642666339874, "step": 12141 }, { "epoch": 0.18130641560710473, "grad_norm": 0.306640625, "grad_norm_var": 0.0006667455037434896, "learning_rate": 0.0001, "loss": 1.4748, "loss/crossentropy": 2.6773793697357178, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.20916780829429626, "step": 12142 }, { "epoch": 0.18132134777771972, "grad_norm": 0.310546875, "grad_norm_var": 0.0006081740061442058, "learning_rate": 0.0001, "loss": 1.5156, "loss/crossentropy": 2.435128331184387, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.2382848709821701, "step": 12143 }, { "epoch": 0.18133627994833468, "grad_norm": 0.287109375, "grad_norm_var": 0.0006769816080729167, "learning_rate": 0.0001, "loss": 1.3164, "loss/crossentropy": 2.74042809009552, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.18358317017555237, "step": 12144 }, { "epoch": 0.18135121211894967, "grad_norm": 0.30078125, "grad_norm_var": 0.0006983439127604167, "learning_rate": 0.0001, "loss": 1.4213, "loss/crossentropy": 2.6285462379455566, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19865190237760544, "step": 12145 }, { "epoch": 0.18136614428956466, "grad_norm": 0.3046875, "grad_norm_var": 0.0007065931955973307, "learning_rate": 0.0001, "loss": 1.3981, "loss/crossentropy": 2.435762405395508, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18717944622039795, "step": 12146 }, { "epoch": 0.18138107646017965, "grad_norm": 0.287109375, "grad_norm_var": 0.0006894270579020182, "learning_rate": 0.0001, "loss": 1.4306, "loss/crossentropy": 2.6860015392303467, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.21965563297271729, "step": 12147 }, { "epoch": 0.1813960086307946, "grad_norm": 0.369140625, "grad_norm_var": 0.0008164564768473307, "learning_rate": 0.0001, "loss": 1.9428, "loss/crossentropy": 2.3612221479415894, "loss/fcd": 1.609375, "loss/idx": 11.0, "loss/logits": 0.3334292024374008, "step": 12148 }, { "epoch": 0.1814109408014096, "grad_norm": 0.326171875, "grad_norm_var": 0.0008135318756103515, "learning_rate": 0.0001, "loss": 1.5477, "loss/crossentropy": 2.464871406555176, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.2156851589679718, "step": 12149 }, { "epoch": 0.1814258729720246, "grad_norm": 0.283203125, "grad_norm_var": 0.0008930842081705729, "learning_rate": 0.0001, "loss": 1.3261, "loss/crossentropy": 2.667466878890991, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17762195318937302, "step": 12150 }, { "epoch": 0.18144080514263955, "grad_norm": 0.36328125, "grad_norm_var": 0.000941324234008789, "learning_rate": 0.0001, "loss": 1.4665, "loss/crossentropy": 2.7684237957000732, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20865106582641602, "step": 12151 }, { "epoch": 0.18145573731325454, "grad_norm": 0.3125, "grad_norm_var": 0.0009049574534098307, "learning_rate": 0.0001, "loss": 1.4177, "loss/crossentropy": 2.6651363372802734, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.18726538121700287, "step": 12152 }, { "epoch": 0.18147066948386953, "grad_norm": 0.3046875, "grad_norm_var": 0.000925127665201823, "learning_rate": 0.0001, "loss": 1.4318, "loss/crossentropy": 2.7915338277816772, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2052517905831337, "step": 12153 }, { "epoch": 0.1814856016544845, "grad_norm": 0.30078125, "grad_norm_var": 0.000925127665201823, "learning_rate": 0.0001, "loss": 1.3127, "loss/crossentropy": 2.664823293685913, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1642782986164093, "step": 12154 }, { "epoch": 0.18150053382509948, "grad_norm": 0.31640625, "grad_norm_var": 0.0009051005045572917, "learning_rate": 0.0001, "loss": 1.4371, "loss/crossentropy": 2.668572783470154, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2027033045887947, "step": 12155 }, { "epoch": 0.18151546599571447, "grad_norm": 0.314453125, "grad_norm_var": 0.0005869547526041667, "learning_rate": 0.0001, "loss": 1.4165, "loss/crossentropy": 2.670764207839966, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.21341363340616226, "step": 12156 }, { "epoch": 0.18153039816632946, "grad_norm": 0.30078125, "grad_norm_var": 0.0005802790323893229, "learning_rate": 0.0001, "loss": 1.3927, "loss/crossentropy": 2.4774447679519653, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18565960228443146, "step": 12157 }, { "epoch": 0.18154533033694442, "grad_norm": 0.30859375, "grad_norm_var": 0.0005791823069254558, "learning_rate": 0.0001, "loss": 1.3783, "loss/crossentropy": 2.9194196462631226, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19081097096204758, "step": 12158 }, { "epoch": 0.1815602625075594, "grad_norm": 0.318359375, "grad_norm_var": 0.0005815982818603516, "learning_rate": 0.0001, "loss": 1.337, "loss/crossentropy": 2.573812961578369, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.16907737404108047, "step": 12159 }, { "epoch": 0.1815751946781744, "grad_norm": 0.306640625, "grad_norm_var": 0.0005396366119384766, "learning_rate": 0.0001, "loss": 1.4284, "loss/crossentropy": 2.6387840509414673, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19797592610120773, "step": 12160 }, { "epoch": 0.18159012684878936, "grad_norm": 0.291015625, "grad_norm_var": 0.000562286376953125, "learning_rate": 0.0001, "loss": 1.3077, "loss/crossentropy": 2.727265238761902, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1787760853767395, "step": 12161 }, { "epoch": 0.18160505901940435, "grad_norm": 0.267578125, "grad_norm_var": 0.0006894270579020182, "learning_rate": 0.0001, "loss": 1.4473, "loss/crossentropy": 2.52169930934906, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21297328174114227, "step": 12162 }, { "epoch": 0.18161999119001934, "grad_norm": 0.279296875, "grad_norm_var": 0.000717782974243164, "learning_rate": 0.0001, "loss": 1.3116, "loss/crossentropy": 2.4961971044540405, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.17489783465862274, "step": 12163 }, { "epoch": 0.18163492336063433, "grad_norm": 0.330078125, "grad_norm_var": 0.0005060672760009765, "learning_rate": 0.0001, "loss": 1.4646, "loss/crossentropy": 2.4553773403167725, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.18331387639045715, "step": 12164 }, { "epoch": 0.1816498555312493, "grad_norm": 0.318359375, "grad_norm_var": 0.0004906813303629557, "learning_rate": 0.0001, "loss": 1.4839, "loss/crossentropy": 2.709581732749939, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.22214390337467194, "step": 12165 }, { "epoch": 0.18166478770186428, "grad_norm": 0.287109375, "grad_norm_var": 0.0004791100819905599, "learning_rate": 0.0001, "loss": 1.3592, "loss/crossentropy": 2.860349416732788, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19121456146240234, "step": 12166 }, { "epoch": 0.18167971987247927, "grad_norm": 0.3671875, "grad_norm_var": 0.0005091190338134766, "learning_rate": 0.0001, "loss": 1.444, "loss/crossentropy": 2.561258554458618, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.2057340443134308, "step": 12167 }, { "epoch": 0.18169465204309423, "grad_norm": 0.287109375, "grad_norm_var": 0.000533294677734375, "learning_rate": 0.0001, "loss": 1.3158, "loss/crossentropy": 2.486851453781128, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1673382967710495, "step": 12168 }, { "epoch": 0.18170958421370922, "grad_norm": 0.31640625, "grad_norm_var": 0.0005395889282226562, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.435237407684326, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.18381042778491974, "step": 12169 }, { "epoch": 0.18172451638432421, "grad_norm": 0.306640625, "grad_norm_var": 0.000536966323852539, "learning_rate": 0.0001, "loss": 1.496, "loss/crossentropy": 2.4237974882125854, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.21472516655921936, "step": 12170 }, { "epoch": 0.18173944855493918, "grad_norm": 0.296875, "grad_norm_var": 0.000536966323852539, "learning_rate": 0.0001, "loss": 1.4377, "loss/crossentropy": 2.8400816917419434, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.2150128409266472, "step": 12171 }, { "epoch": 0.18175438072555417, "grad_norm": 0.326171875, "grad_norm_var": 0.0005587100982666016, "learning_rate": 0.0001, "loss": 1.3924, "loss/crossentropy": 2.6788415908813477, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18539634346961975, "step": 12172 }, { "epoch": 0.18176931289616916, "grad_norm": 0.31640625, "grad_norm_var": 0.0005615075429280599, "learning_rate": 0.0001, "loss": 1.4707, "loss/crossentropy": 2.7893248796463013, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.22071118652820587, "step": 12173 }, { "epoch": 0.18178424506678414, "grad_norm": 0.267578125, "grad_norm_var": 0.0006619771321614583, "learning_rate": 0.0001, "loss": 1.2766, "loss/crossentropy": 2.4154707193374634, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.16722970455884933, "step": 12174 }, { "epoch": 0.1817991772373991, "grad_norm": 0.26171875, "grad_norm_var": 0.0007629235585530599, "learning_rate": 0.0001, "loss": 1.2549, "loss/crossentropy": 2.612420678138733, "loss/fcd": 1.09375, "loss/idx": 11.0, "loss/logits": 0.16115260124206543, "step": 12175 }, { "epoch": 0.1818141094080141, "grad_norm": 0.2734375, "grad_norm_var": 0.0008096694946289062, "learning_rate": 0.0001, "loss": 1.3318, "loss/crossentropy": 2.616329073905945, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1833888217806816, "step": 12176 }, { "epoch": 0.18182904157862909, "grad_norm": 0.283203125, "grad_norm_var": 0.0008223851521809896, "learning_rate": 0.0001, "loss": 1.3393, "loss/crossentropy": 2.461353063583374, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17913280427455902, "step": 12177 }, { "epoch": 0.18184397374924405, "grad_norm": 0.357421875, "grad_norm_var": 0.0009496053059895833, "learning_rate": 0.0001, "loss": 1.5036, "loss/crossentropy": 2.53175950050354, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22238145768642426, "step": 12178 }, { "epoch": 0.18185890591985904, "grad_norm": 0.302734375, "grad_norm_var": 0.0009045918782552083, "learning_rate": 0.0001, "loss": 1.3412, "loss/crossentropy": 2.6800562143325806, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18102489411830902, "step": 12179 }, { "epoch": 0.18187383809047403, "grad_norm": 0.28515625, "grad_norm_var": 0.0008874098459879557, "learning_rate": 0.0001, "loss": 1.4323, "loss/crossentropy": 2.696714997291565, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20179231464862823, "step": 12180 }, { "epoch": 0.18188877026108902, "grad_norm": 0.3046875, "grad_norm_var": 0.000871721903483073, "learning_rate": 0.0001, "loss": 1.4679, "loss/crossentropy": 2.581338882446289, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.22575293481349945, "step": 12181 }, { "epoch": 0.18190370243170398, "grad_norm": 0.2890625, "grad_norm_var": 0.0008679548899332682, "learning_rate": 0.0001, "loss": 1.3671, "loss/crossentropy": 2.694558620452881, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.17957745492458344, "step": 12182 }, { "epoch": 0.18191863460231897, "grad_norm": 0.265625, "grad_norm_var": 0.0006381829579671223, "learning_rate": 0.0001, "loss": 1.336, "loss/crossentropy": 2.483632802963257, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18369130045175552, "step": 12183 }, { "epoch": 0.18193356677293396, "grad_norm": 0.310546875, "grad_norm_var": 0.0006439050038655598, "learning_rate": 0.0001, "loss": 1.3776, "loss/crossentropy": 2.9108394384384155, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19794929027557373, "step": 12184 }, { "epoch": 0.18194849894354892, "grad_norm": 0.275390625, "grad_norm_var": 0.0006469090779622396, "learning_rate": 0.0001, "loss": 1.2827, "loss/crossentropy": 2.8449290990829468, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.1771959811449051, "step": 12185 }, { "epoch": 0.1819634311141639, "grad_norm": 0.30078125, "grad_norm_var": 0.0006400903065999348, "learning_rate": 0.0001, "loss": 1.4454, "loss/crossentropy": 2.5705467462539673, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20711451768875122, "step": 12186 }, { "epoch": 0.1819783632847789, "grad_norm": 0.29296875, "grad_norm_var": 0.000639963150024414, "learning_rate": 0.0001, "loss": 1.3898, "loss/crossentropy": 2.6270424127578735, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.2022826299071312, "step": 12187 }, { "epoch": 0.18199329545539386, "grad_norm": 0.376953125, "grad_norm_var": 0.0010152022043863932, "learning_rate": 0.0001, "loss": 1.508, "loss/crossentropy": 2.515295147895813, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.24240849167108536, "step": 12188 }, { "epoch": 0.18200822762600885, "grad_norm": 0.267578125, "grad_norm_var": 0.001042620340983073, "learning_rate": 0.0001, "loss": 1.2635, "loss/crossentropy": 2.488671660423279, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.15802672505378723, "step": 12189 }, { "epoch": 0.18202315979662384, "grad_norm": 0.30078125, "grad_norm_var": 0.0009915510813395183, "learning_rate": 0.0001, "loss": 1.2836, "loss/crossentropy": 2.9191887378692627, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.16645360738039017, "step": 12190 }, { "epoch": 0.18203809196723883, "grad_norm": 0.287109375, "grad_norm_var": 0.000913238525390625, "learning_rate": 0.0001, "loss": 1.4651, "loss/crossentropy": 2.54477596282959, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.21122203022241592, "step": 12191 }, { "epoch": 0.1820530241378538, "grad_norm": 0.322265625, "grad_norm_var": 0.0009001255035400391, "learning_rate": 0.0001, "loss": 1.4701, "loss/crossentropy": 2.3645952939987183, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.19277169555425644, "step": 12192 }, { "epoch": 0.18206795630846878, "grad_norm": 0.298828125, "grad_norm_var": 0.0008774916330973307, "learning_rate": 0.0001, "loss": 1.4233, "loss/crossentropy": 2.7295161485671997, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20455976575613022, "step": 12193 }, { "epoch": 0.18208288847908377, "grad_norm": 0.28515625, "grad_norm_var": 0.0006734212239583333, "learning_rate": 0.0001, "loss": 1.3542, "loss/crossentropy": 2.5164036750793457, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18228591233491898, "step": 12194 }, { "epoch": 0.18209782064969873, "grad_norm": 0.310546875, "grad_norm_var": 0.0006823221842447917, "learning_rate": 0.0001, "loss": 1.3836, "loss/crossentropy": 2.767544984817505, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1883191615343094, "step": 12195 }, { "epoch": 0.18211275282031372, "grad_norm": 0.33984375, "grad_norm_var": 0.0007731119791666666, "learning_rate": 0.0001, "loss": 1.5765, "loss/crossentropy": 2.644300103187561, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.23271295428276062, "step": 12196 }, { "epoch": 0.1821276849909287, "grad_norm": 0.291015625, "grad_norm_var": 0.0007794539133707683, "learning_rate": 0.0001, "loss": 1.4006, "loss/crossentropy": 2.6649112701416016, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20141851156949997, "step": 12197 }, { "epoch": 0.18214261716154367, "grad_norm": 0.26953125, "grad_norm_var": 0.0008341312408447265, "learning_rate": 0.0001, "loss": 1.3745, "loss/crossentropy": 2.6415499448776245, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19868340343236923, "step": 12198 }, { "epoch": 0.18215754933215866, "grad_norm": 0.3203125, "grad_norm_var": 0.0007727146148681641, "learning_rate": 0.0001, "loss": 1.4363, "loss/crossentropy": 2.447648525238037, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.19409581273794174, "step": 12199 }, { "epoch": 0.18217248150277365, "grad_norm": 0.2734375, "grad_norm_var": 0.0008219401041666667, "learning_rate": 0.0001, "loss": 1.3445, "loss/crossentropy": 2.7001501321792603, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1921979859471321, "step": 12200 }, { "epoch": 0.18218741367338864, "grad_norm": 0.3046875, "grad_norm_var": 0.0007764021555582682, "learning_rate": 0.0001, "loss": 1.3766, "loss/crossentropy": 2.541016697883606, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19696053862571716, "step": 12201 }, { "epoch": 0.1822023458440036, "grad_norm": 0.4609375, "grad_norm_var": 0.0023404280344645183, "learning_rate": 0.0001, "loss": 1.8038, "loss/crossentropy": 2.4111658334732056, "loss/fcd": 1.56640625, "loss/idx": 11.0, "loss/logits": 0.23742330819368362, "step": 12202 }, { "epoch": 0.1822172780146186, "grad_norm": 0.306640625, "grad_norm_var": 0.0023162841796875, "learning_rate": 0.0001, "loss": 1.3596, "loss/crossentropy": 2.5927295684814453, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.17598596215248108, "step": 12203 }, { "epoch": 0.18223221018523358, "grad_norm": 0.314453125, "grad_norm_var": 0.0020314534505208332, "learning_rate": 0.0001, "loss": 1.4303, "loss/crossentropy": 2.6010236740112305, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19986887276172638, "step": 12204 }, { "epoch": 0.18224714235584855, "grad_norm": 0.287109375, "grad_norm_var": 0.001945940653483073, "learning_rate": 0.0001, "loss": 1.303, "loss/crossentropy": 2.5376840829849243, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17020948976278305, "step": 12205 }, { "epoch": 0.18226207452646354, "grad_norm": 0.263671875, "grad_norm_var": 0.0020815372467041016, "learning_rate": 0.0001, "loss": 1.3197, "loss/crossentropy": 2.542436957359314, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.1790301278233528, "step": 12206 }, { "epoch": 0.18227700669707853, "grad_norm": 0.294921875, "grad_norm_var": 0.0020630995432535807, "learning_rate": 0.0001, "loss": 1.3572, "loss/crossentropy": 2.627702474594116, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17753517627716064, "step": 12207 }, { "epoch": 0.18229193886769351, "grad_norm": 0.28515625, "grad_norm_var": 0.0020833333333333333, "learning_rate": 0.0001, "loss": 1.3032, "loss/crossentropy": 2.6138041019439697, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.18209417909383774, "step": 12208 }, { "epoch": 0.18230687103830848, "grad_norm": 0.28125, "grad_norm_var": 0.0021209557851155597, "learning_rate": 0.0001, "loss": 1.3549, "loss/crossentropy": 2.54691743850708, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18697035312652588, "step": 12209 }, { "epoch": 0.18232180320892347, "grad_norm": 0.34765625, "grad_norm_var": 0.0021952152252197265, "learning_rate": 0.0001, "loss": 1.6264, "loss/crossentropy": 2.450567126274109, "loss/fcd": 1.390625, "loss/idx": 11.0, "loss/logits": 0.23575758934020996, "step": 12210 }, { "epoch": 0.18233673537953846, "grad_norm": 0.310546875, "grad_norm_var": 0.0021952152252197265, "learning_rate": 0.0001, "loss": 1.4373, "loss/crossentropy": 2.827709674835205, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2068021297454834, "step": 12211 }, { "epoch": 0.18235166755015342, "grad_norm": 0.283203125, "grad_norm_var": 0.0021661758422851563, "learning_rate": 0.0001, "loss": 1.3962, "loss/crossentropy": 2.5129276514053345, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1930723786354065, "step": 12212 }, { "epoch": 0.1823665997207684, "grad_norm": 0.3125, "grad_norm_var": 0.0021523634592692056, "learning_rate": 0.0001, "loss": 1.4832, "loss/crossentropy": 2.694450259208679, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21754294633865356, "step": 12213 }, { "epoch": 0.1823815318913834, "grad_norm": 0.265625, "grad_norm_var": 0.002172962824503581, "learning_rate": 0.0001, "loss": 1.403, "loss/crossentropy": 2.48003613948822, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19592487066984177, "step": 12214 }, { "epoch": 0.18239646406199836, "grad_norm": 0.29296875, "grad_norm_var": 0.002171182632446289, "learning_rate": 0.0001, "loss": 1.3238, "loss/crossentropy": 2.567761540412903, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17930403351783752, "step": 12215 }, { "epoch": 0.18241139623261335, "grad_norm": 0.30859375, "grad_norm_var": 0.0020990848541259767, "learning_rate": 0.0001, "loss": 1.4489, "loss/crossentropy": 2.3457142114639282, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.17933039367198944, "step": 12216 }, { "epoch": 0.18242632840322834, "grad_norm": 0.27734375, "grad_norm_var": 0.00215605099995931, "learning_rate": 0.0001, "loss": 1.3389, "loss/crossentropy": 2.6182206869125366, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17875853925943375, "step": 12217 }, { "epoch": 0.18244126057384333, "grad_norm": 0.3125, "grad_norm_var": 0.00046245257059733075, "learning_rate": 0.0001, "loss": 1.7106, "loss/crossentropy": 2.424411654472351, "loss/fcd": 1.4453125, "loss/idx": 11.0, "loss/logits": 0.2652507349848747, "step": 12218 }, { "epoch": 0.1824561927444583, "grad_norm": 0.396484375, "grad_norm_var": 0.0010883172353108725, "learning_rate": 0.0001, "loss": 1.4984, "loss/crossentropy": 3.0111255645751953, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2288428694009781, "step": 12219 }, { "epoch": 0.18247112491507328, "grad_norm": 0.283203125, "grad_norm_var": 0.0010979811350504557, "learning_rate": 0.0001, "loss": 1.3577, "loss/crossentropy": 2.139983296394348, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1702091246843338, "step": 12220 }, { "epoch": 0.18248605708568827, "grad_norm": 0.322265625, "grad_norm_var": 0.0011140028635660806, "learning_rate": 0.0001, "loss": 1.4364, "loss/crossentropy": 2.5816138982772827, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19811908900737762, "step": 12221 }, { "epoch": 0.18250098925630323, "grad_norm": 0.306640625, "grad_norm_var": 0.0010076999664306641, "learning_rate": 0.0001, "loss": 1.4553, "loss/crossentropy": 2.5292524099349976, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20531971752643585, "step": 12222 }, { "epoch": 0.18251592142691822, "grad_norm": 0.3203125, "grad_norm_var": 0.0010136922200520834, "learning_rate": 0.0001, "loss": 1.3731, "loss/crossentropy": 2.9323514699935913, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1817086637020111, "step": 12223 }, { "epoch": 0.1825308535975332, "grad_norm": 0.349609375, "grad_norm_var": 0.0010886987050374348, "learning_rate": 0.0001, "loss": 1.4951, "loss/crossentropy": 2.7343201637268066, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.24122555553913116, "step": 12224 }, { "epoch": 0.1825457857681482, "grad_norm": 0.30859375, "grad_norm_var": 0.0010281721750895181, "learning_rate": 0.0001, "loss": 1.5208, "loss/crossentropy": 2.4231892824172974, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.2122460976243019, "step": 12225 }, { "epoch": 0.18256071793876316, "grad_norm": 0.3046875, "grad_norm_var": 0.0009414513905843098, "learning_rate": 0.0001, "loss": 1.4885, "loss/crossentropy": 2.512668251991272, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.21897423267364502, "step": 12226 }, { "epoch": 0.18257565010937815, "grad_norm": 0.267578125, "grad_norm_var": 0.001051950454711914, "learning_rate": 0.0001, "loss": 1.3027, "loss/crossentropy": 2.478440999984741, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1776987835764885, "step": 12227 }, { "epoch": 0.18259058227999314, "grad_norm": 0.291015625, "grad_norm_var": 0.0010309696197509765, "learning_rate": 0.0001, "loss": 1.5022, "loss/crossentropy": 2.3708914518356323, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2326885536313057, "step": 12228 }, { "epoch": 0.1826055144506081, "grad_norm": 0.302734375, "grad_norm_var": 0.001030413309733073, "learning_rate": 0.0001, "loss": 1.4673, "loss/crossentropy": 2.266615629196167, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.21335506439208984, "step": 12229 }, { "epoch": 0.1826204466212231, "grad_norm": 0.357421875, "grad_norm_var": 0.001052077611287435, "learning_rate": 0.0001, "loss": 1.3643, "loss/crossentropy": 2.7266740798950195, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18462401628494263, "step": 12230 }, { "epoch": 0.18263537879183808, "grad_norm": 0.337890625, "grad_norm_var": 0.00106048583984375, "learning_rate": 0.0001, "loss": 1.3909, "loss/crossentropy": 2.6098852157592773, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19557610899209976, "step": 12231 }, { "epoch": 0.18265031096245304, "grad_norm": 0.3515625, "grad_norm_var": 0.0011367162068684896, "learning_rate": 0.0001, "loss": 1.4349, "loss/crossentropy": 2.833623766899109, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20440345257520676, "step": 12232 }, { "epoch": 0.18266524313306803, "grad_norm": 0.345703125, "grad_norm_var": 0.0010571638743082682, "learning_rate": 0.0001, "loss": 1.5909, "loss/crossentropy": 2.393728017807007, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.23539405316114426, "step": 12233 }, { "epoch": 0.18268017530368302, "grad_norm": 0.30859375, "grad_norm_var": 0.0010632673899332683, "learning_rate": 0.0001, "loss": 1.5026, "loss/crossentropy": 2.6921733617782593, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.24092885851860046, "step": 12234 }, { "epoch": 0.182695107474298, "grad_norm": 0.353515625, "grad_norm_var": 0.0007527510325113933, "learning_rate": 0.0001, "loss": 1.5854, "loss/crossentropy": 2.32202011346817, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.2299003005027771, "step": 12235 }, { "epoch": 0.18271003964491297, "grad_norm": 0.3125, "grad_norm_var": 0.0006647745768229167, "learning_rate": 0.0001, "loss": 1.3664, "loss/crossentropy": 2.765671491622925, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1789238005876541, "step": 12236 }, { "epoch": 0.18272497181552796, "grad_norm": 0.28125, "grad_norm_var": 0.0007645765940348308, "learning_rate": 0.0001, "loss": 1.3918, "loss/crossentropy": 2.5673173666000366, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19257496297359467, "step": 12237 }, { "epoch": 0.18273990398614295, "grad_norm": 0.36328125, "grad_norm_var": 0.0008738199869791667, "learning_rate": 0.0001, "loss": 1.51, "loss/crossentropy": 2.7156689167022705, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2248230278491974, "step": 12238 }, { "epoch": 0.18275483615675792, "grad_norm": 0.34375, "grad_norm_var": 0.0009020487467447917, "learning_rate": 0.0001, "loss": 1.5423, "loss/crossentropy": 2.4325499534606934, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2298150733113289, "step": 12239 }, { "epoch": 0.1827697683273729, "grad_norm": 0.283203125, "grad_norm_var": 0.0009485244750976563, "learning_rate": 0.0001, "loss": 1.3456, "loss/crossentropy": 2.5717241764068604, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18154460936784744, "step": 12240 }, { "epoch": 0.1827847004979879, "grad_norm": 0.302734375, "grad_norm_var": 0.0009592533111572266, "learning_rate": 0.0001, "loss": 1.3297, "loss/crossentropy": 2.4454634189605713, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.16952405124902725, "step": 12241 }, { "epoch": 0.18279963266860288, "grad_norm": 0.322265625, "grad_norm_var": 0.00094451904296875, "learning_rate": 0.0001, "loss": 1.4526, "loss/crossentropy": 2.7121955156326294, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.2142791748046875, "step": 12242 }, { "epoch": 0.18281456483921785, "grad_norm": 0.296875, "grad_norm_var": 0.0007921695709228515, "learning_rate": 0.0001, "loss": 1.4141, "loss/crossentropy": 2.5218899250030518, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19928891956806183, "step": 12243 }, { "epoch": 0.18282949700983284, "grad_norm": 0.259765625, "grad_norm_var": 0.0009829044342041016, "learning_rate": 0.0001, "loss": 1.2902, "loss/crossentropy": 2.495801091194153, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.16913364827632904, "step": 12244 }, { "epoch": 0.18284442918044783, "grad_norm": 0.333984375, "grad_norm_var": 0.0009712060292561849, "learning_rate": 0.0001, "loss": 1.5042, "loss/crossentropy": 2.5366164445877075, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21515227854251862, "step": 12245 }, { "epoch": 0.1828593613510628, "grad_norm": 0.2578125, "grad_norm_var": 0.0011227925618489583, "learning_rate": 0.0001, "loss": 1.3744, "loss/crossentropy": 2.580242395401001, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19468872994184494, "step": 12246 }, { "epoch": 0.18287429352167778, "grad_norm": 0.29296875, "grad_norm_var": 0.0011173089345296223, "learning_rate": 0.0001, "loss": 1.4744, "loss/crossentropy": 2.6770169734954834, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.23216666281223297, "step": 12247 }, { "epoch": 0.18288922569229277, "grad_norm": 0.52734375, "grad_norm_var": 0.003949721654256185, "learning_rate": 0.0001, "loss": 1.4009, "loss/crossentropy": 2.473478078842163, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.17042884230613708, "step": 12248 }, { "epoch": 0.18290415786290773, "grad_norm": 0.28515625, "grad_norm_var": 0.004004414876302083, "learning_rate": 0.0001, "loss": 1.3306, "loss/crossentropy": 2.580607295036316, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17821519076824188, "step": 12249 }, { "epoch": 0.18291909003352272, "grad_norm": 0.275390625, "grad_norm_var": 0.004125197728474935, "learning_rate": 0.0001, "loss": 1.3532, "loss/crossentropy": 2.4893786907196045, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1930558979511261, "step": 12250 }, { "epoch": 0.1829340222041377, "grad_norm": 0.314453125, "grad_norm_var": 0.004036823908487956, "learning_rate": 0.0001, "loss": 1.4242, "loss/crossentropy": 2.6980408430099487, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20149962604045868, "step": 12251 }, { "epoch": 0.1829489543747527, "grad_norm": 0.375, "grad_norm_var": 0.004253498713175456, "learning_rate": 0.0001, "loss": 1.4729, "loss/crossentropy": 2.4897937774658203, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.18380045890808105, "step": 12252 }, { "epoch": 0.18296388654536766, "grad_norm": 0.263671875, "grad_norm_var": 0.004362932840983073, "learning_rate": 0.0001, "loss": 1.3493, "loss/crossentropy": 2.603541612625122, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1813194751739502, "step": 12253 }, { "epoch": 0.18297881871598265, "grad_norm": 0.267578125, "grad_norm_var": 0.004365269343058268, "learning_rate": 0.0001, "loss": 1.3724, "loss/crossentropy": 2.57503080368042, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.2044261395931244, "step": 12254 }, { "epoch": 0.18299375088659764, "grad_norm": 0.40234375, "grad_norm_var": 0.004823033014933268, "learning_rate": 0.0001, "loss": 1.5527, "loss/crossentropy": 2.488096594810486, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.22069060802459717, "step": 12255 }, { "epoch": 0.1830086830572126, "grad_norm": 0.33984375, "grad_norm_var": 0.004773712158203125, "learning_rate": 0.0001, "loss": 1.6083, "loss/crossentropy": 2.5756194591522217, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.22158785164356232, "step": 12256 }, { "epoch": 0.1830236152278276, "grad_norm": 0.29296875, "grad_norm_var": 0.00480192502339681, "learning_rate": 0.0001, "loss": 1.5008, "loss/crossentropy": 2.6297802925109863, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.23127739131450653, "step": 12257 }, { "epoch": 0.18303854739844258, "grad_norm": 0.26171875, "grad_norm_var": 0.00500640869140625, "learning_rate": 0.0001, "loss": 1.2808, "loss/crossentropy": 2.794252395629883, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.17143632471561432, "step": 12258 }, { "epoch": 0.18305347956905754, "grad_norm": 0.36328125, "grad_norm_var": 0.005117734273274739, "learning_rate": 0.0001, "loss": 1.595, "loss/crossentropy": 2.3731424808502197, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.23172979056835175, "step": 12259 }, { "epoch": 0.18306841173967253, "grad_norm": 0.283203125, "grad_norm_var": 0.004965146382649739, "learning_rate": 0.0001, "loss": 1.3638, "loss/crossentropy": 2.600019097328186, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18024948239326477, "step": 12260 }, { "epoch": 0.18308334391028752, "grad_norm": 0.322265625, "grad_norm_var": 0.004953511555989583, "learning_rate": 0.0001, "loss": 1.5079, "loss/crossentropy": 2.5711487531661987, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.2344396933913231, "step": 12261 }, { "epoch": 0.1830982760809025, "grad_norm": 0.294921875, "grad_norm_var": 0.004730335871378581, "learning_rate": 0.0001, "loss": 1.369, "loss/crossentropy": 2.484723210334778, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18146444857120514, "step": 12262 }, { "epoch": 0.18311320825151747, "grad_norm": 0.283203125, "grad_norm_var": 0.004774920145670573, "learning_rate": 0.0001, "loss": 1.4766, "loss/crossentropy": 2.450544238090515, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.22656891494989395, "step": 12263 }, { "epoch": 0.18312814042213246, "grad_norm": 0.337890625, "grad_norm_var": 0.0018316745758056641, "learning_rate": 0.0001, "loss": 1.58, "loss/crossentropy": 2.927404761314392, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.23629041761159897, "step": 12264 }, { "epoch": 0.18314307259274745, "grad_norm": 0.296875, "grad_norm_var": 0.0018011569976806641, "learning_rate": 0.0001, "loss": 1.3497, "loss/crossentropy": 2.685989499092102, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18171992897987366, "step": 12265 }, { "epoch": 0.18315800476336241, "grad_norm": 0.287109375, "grad_norm_var": 0.0017542362213134766, "learning_rate": 0.0001, "loss": 1.3457, "loss/crossentropy": 2.65142285823822, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1855550855398178, "step": 12266 }, { "epoch": 0.1831729369339774, "grad_norm": 0.3359375, "grad_norm_var": 0.0017911275227864584, "learning_rate": 0.0001, "loss": 1.3963, "loss/crossentropy": 2.7492398023605347, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.21660368889570236, "step": 12267 }, { "epoch": 0.1831878691045924, "grad_norm": 0.265625, "grad_norm_var": 0.0016344706217447916, "learning_rate": 0.0001, "loss": 1.3467, "loss/crossentropy": 2.6222182512283325, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.19041141122579575, "step": 12268 }, { "epoch": 0.18320280127520738, "grad_norm": 0.28125, "grad_norm_var": 0.0015542189280192058, "learning_rate": 0.0001, "loss": 1.3763, "loss/crossentropy": 2.354643225669861, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18490920960903168, "step": 12269 }, { "epoch": 0.18321773344582234, "grad_norm": 0.279296875, "grad_norm_var": 0.0015008131663004558, "learning_rate": 0.0001, "loss": 1.4482, "loss/crossentropy": 2.553363561630249, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2137906402349472, "step": 12270 }, { "epoch": 0.18323266561643733, "grad_norm": 0.318359375, "grad_norm_var": 0.000885009765625, "learning_rate": 0.0001, "loss": 1.4481, "loss/crossentropy": 2.6645587682724, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20197217911481857, "step": 12271 }, { "epoch": 0.18324759778705232, "grad_norm": 0.2890625, "grad_norm_var": 0.0007949193318684896, "learning_rate": 0.0001, "loss": 1.3793, "loss/crossentropy": 2.733131766319275, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1800600215792656, "step": 12272 }, { "epoch": 0.1832625299576673, "grad_norm": 0.3046875, "grad_norm_var": 0.0007932027180989583, "learning_rate": 0.0001, "loss": 1.4033, "loss/crossentropy": 2.607498049736023, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19631795585155487, "step": 12273 }, { "epoch": 0.18327746212828228, "grad_norm": 0.337890625, "grad_norm_var": 0.0007640679677327473, "learning_rate": 0.0001, "loss": 1.4017, "loss/crossentropy": 2.5796992778778076, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.18682269006967545, "step": 12274 }, { "epoch": 0.18329239429889727, "grad_norm": 0.287109375, "grad_norm_var": 0.0005353291829427084, "learning_rate": 0.0001, "loss": 1.3714, "loss/crossentropy": 2.463024854660034, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18004101514816284, "step": 12275 }, { "epoch": 0.18330732646951223, "grad_norm": 0.33984375, "grad_norm_var": 0.0006067752838134766, "learning_rate": 0.0001, "loss": 1.5338, "loss/crossentropy": 2.601162552833557, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.22911139577627182, "step": 12276 }, { "epoch": 0.18332225864012722, "grad_norm": 0.318359375, "grad_norm_var": 0.0005981286366780598, "learning_rate": 0.0001, "loss": 1.4459, "loss/crossentropy": 2.5967869758605957, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2037104070186615, "step": 12277 }, { "epoch": 0.1833371908107422, "grad_norm": 0.359375, "grad_norm_var": 0.0007832845052083334, "learning_rate": 0.0001, "loss": 1.6901, "loss/crossentropy": 2.4917173385620117, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.27994902431964874, "step": 12278 }, { "epoch": 0.1833521229813572, "grad_norm": 0.28515625, "grad_norm_var": 0.0007771650950113932, "learning_rate": 0.0001, "loss": 1.3802, "loss/crossentropy": 2.7063885927200317, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.2004680559039116, "step": 12279 }, { "epoch": 0.18336705515197216, "grad_norm": 0.296875, "grad_norm_var": 0.0007174173990885417, "learning_rate": 0.0001, "loss": 1.5889, "loss/crossentropy": 2.7315319776535034, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.24901816248893738, "step": 12280 }, { "epoch": 0.18338198732258715, "grad_norm": 0.384765625, "grad_norm_var": 0.0011029402414957682, "learning_rate": 0.0001, "loss": 1.6219, "loss/crossentropy": 2.519579529762268, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.21170980483293533, "step": 12281 }, { "epoch": 0.18339691949320214, "grad_norm": 0.3203125, "grad_norm_var": 0.0010675430297851563, "learning_rate": 0.0001, "loss": 1.3456, "loss/crossentropy": 2.714478850364685, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17761188000440598, "step": 12282 }, { "epoch": 0.1834118516638171, "grad_norm": 0.322265625, "grad_norm_var": 0.0010369459788004557, "learning_rate": 0.0001, "loss": 1.4767, "loss/crossentropy": 2.7820242643356323, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21495110541582108, "step": 12283 }, { "epoch": 0.1834267838344321, "grad_norm": 0.265625, "grad_norm_var": 0.0010369459788004557, "learning_rate": 0.0001, "loss": 1.3933, "loss/crossentropy": 2.533533811569214, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.20584923028945923, "step": 12284 }, { "epoch": 0.18344171600504708, "grad_norm": 0.31640625, "grad_norm_var": 0.0009705702463785808, "learning_rate": 0.0001, "loss": 1.3709, "loss/crossentropy": 2.6855599880218506, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19121171534061432, "step": 12285 }, { "epoch": 0.18345664817566207, "grad_norm": 0.283203125, "grad_norm_var": 0.0009534041086832683, "learning_rate": 0.0001, "loss": 1.4047, "loss/crossentropy": 2.646255135536194, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20547574013471603, "step": 12286 }, { "epoch": 0.18347158034627703, "grad_norm": 0.298828125, "grad_norm_var": 0.0009667555491129558, "learning_rate": 0.0001, "loss": 1.5139, "loss/crossentropy": 2.272765874862671, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.20921918004751205, "step": 12287 }, { "epoch": 0.18348651251689202, "grad_norm": 0.353515625, "grad_norm_var": 0.001019732157389323, "learning_rate": 0.0001, "loss": 1.5692, "loss/crossentropy": 2.621068596839905, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.2410702332854271, "step": 12288 }, { "epoch": 0.183501444687507, "grad_norm": 0.283203125, "grad_norm_var": 0.0010842482248942058, "learning_rate": 0.0001, "loss": 1.4465, "loss/crossentropy": 2.548197627067566, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2042638137936592, "step": 12289 }, { "epoch": 0.18351637685812197, "grad_norm": 0.33203125, "grad_norm_var": 0.0010691324869791666, "learning_rate": 0.0001, "loss": 1.519, "loss/crossentropy": 2.519507646560669, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.21426933258771896, "step": 12290 }, { "epoch": 0.18353130902873696, "grad_norm": 0.49609375, "grad_norm_var": 0.003009653091430664, "learning_rate": 0.0001, "loss": 1.623, "loss/crossentropy": 2.609705686569214, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.2597544491291046, "step": 12291 }, { "epoch": 0.18354624119935195, "grad_norm": 0.3125, "grad_norm_var": 0.003014993667602539, "learning_rate": 0.0001, "loss": 1.6072, "loss/crossentropy": 2.537282109260559, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.2634412348270416, "step": 12292 }, { "epoch": 0.1835611733699669, "grad_norm": 0.302734375, "grad_norm_var": 0.003047800064086914, "learning_rate": 0.0001, "loss": 1.3221, "loss/crossentropy": 2.7018022537231445, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17754900455474854, "step": 12293 }, { "epoch": 0.1835761055405819, "grad_norm": 0.388671875, "grad_norm_var": 0.003232574462890625, "learning_rate": 0.0001, "loss": 1.4639, "loss/crossentropy": 2.9396387338638306, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20611542463302612, "step": 12294 }, { "epoch": 0.1835910377111969, "grad_norm": 0.28125, "grad_norm_var": 0.0032556533813476564, "learning_rate": 0.0001, "loss": 1.5129, "loss/crossentropy": 2.497485399246216, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.22770122438669205, "step": 12295 }, { "epoch": 0.18360596988181188, "grad_norm": 0.31640625, "grad_norm_var": 0.0032000223795572917, "learning_rate": 0.0001, "loss": 1.4987, "loss/crossentropy": 2.49999463558197, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.20181968808174133, "step": 12296 }, { "epoch": 0.18362090205242684, "grad_norm": 0.26171875, "grad_norm_var": 0.0032250563303629558, "learning_rate": 0.0001, "loss": 1.3627, "loss/crossentropy": 2.685586452484131, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18302636593580246, "step": 12297 }, { "epoch": 0.18363583422304183, "grad_norm": 0.306640625, "grad_norm_var": 0.0032378514607747395, "learning_rate": 0.0001, "loss": 1.5465, "loss/crossentropy": 2.7182202339172363, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.257392942905426, "step": 12298 }, { "epoch": 0.18365076639365682, "grad_norm": 0.283203125, "grad_norm_var": 0.0033217748006184895, "learning_rate": 0.0001, "loss": 1.3579, "loss/crossentropy": 2.5378949642181396, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.19772187620401382, "step": 12299 }, { "epoch": 0.18366569856427178, "grad_norm": 0.33203125, "grad_norm_var": 0.0031369527180989585, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.889323592185974, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.22487416863441467, "step": 12300 }, { "epoch": 0.18368063073488677, "grad_norm": 0.333984375, "grad_norm_var": 0.0031436761220296223, "learning_rate": 0.0001, "loss": 1.5127, "loss/crossentropy": 2.5206249952316284, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.22362924367189407, "step": 12301 }, { "epoch": 0.18369556290550176, "grad_norm": 0.2890625, "grad_norm_var": 0.0031148274739583332, "learning_rate": 0.0001, "loss": 1.3587, "loss/crossentropy": 2.535431981086731, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1751381754875183, "step": 12302 }, { "epoch": 0.18371049507611675, "grad_norm": 0.279296875, "grad_norm_var": 0.0032022476196289064, "learning_rate": 0.0001, "loss": 1.3521, "loss/crossentropy": 2.586468458175659, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18808357417583466, "step": 12303 }, { "epoch": 0.18372542724673172, "grad_norm": 0.296875, "grad_norm_var": 0.0031649112701416016, "learning_rate": 0.0001, "loss": 1.4077, "loss/crossentropy": 2.7447712421417236, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.21234235167503357, "step": 12304 }, { "epoch": 0.1837403594173467, "grad_norm": 0.349609375, "grad_norm_var": 0.003128163019816081, "learning_rate": 0.0001, "loss": 1.4859, "loss/crossentropy": 2.5067782402038574, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21243131905794144, "step": 12305 }, { "epoch": 0.1837552915879617, "grad_norm": 0.359375, "grad_norm_var": 0.0032091617584228517, "learning_rate": 0.0001, "loss": 1.4839, "loss/crossentropy": 2.5773518085479736, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.2182677611708641, "step": 12306 }, { "epoch": 0.18377022375857666, "grad_norm": 0.296875, "grad_norm_var": 0.001127481460571289, "learning_rate": 0.0001, "loss": 1.3589, "loss/crossentropy": 2.6642106771469116, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1909545511007309, "step": 12307 }, { "epoch": 0.18378515592919165, "grad_norm": 0.296875, "grad_norm_var": 0.0011414686838785808, "learning_rate": 0.0001, "loss": 1.4111, "loss/crossentropy": 2.570752501487732, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.1962806135416031, "step": 12308 }, { "epoch": 0.18380008809980664, "grad_norm": 0.29296875, "grad_norm_var": 0.0011580785115559895, "learning_rate": 0.0001, "loss": 1.4003, "loss/crossentropy": 2.5657418966293335, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20108621567487717, "step": 12309 }, { "epoch": 0.1838150202704216, "grad_norm": 0.3203125, "grad_norm_var": 0.0007358392079671224, "learning_rate": 0.0001, "loss": 1.4516, "loss/crossentropy": 2.525952100753784, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20545966178178787, "step": 12310 }, { "epoch": 0.1838299524410366, "grad_norm": 0.2578125, "grad_norm_var": 0.0008476098378499349, "learning_rate": 0.0001, "loss": 1.3097, "loss/crossentropy": 2.4689847230911255, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17686593532562256, "step": 12311 }, { "epoch": 0.18384488461165158, "grad_norm": 0.306640625, "grad_norm_var": 0.0008381525675455729, "learning_rate": 0.0001, "loss": 1.4116, "loss/crossentropy": 2.7216224670410156, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20457132905721664, "step": 12312 }, { "epoch": 0.18385981678226657, "grad_norm": 0.294921875, "grad_norm_var": 0.000720071792602539, "learning_rate": 0.0001, "loss": 1.3583, "loss/crossentropy": 2.7817225456237793, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18637842684984207, "step": 12313 }, { "epoch": 0.18387474895288153, "grad_norm": 0.361328125, "grad_norm_var": 0.0009114424387613933, "learning_rate": 0.0001, "loss": 1.5381, "loss/crossentropy": 2.1163232922554016, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.20215925574302673, "step": 12314 }, { "epoch": 0.18388968112349652, "grad_norm": 0.291015625, "grad_norm_var": 0.000887918472290039, "learning_rate": 0.0001, "loss": 1.4323, "loss/crossentropy": 2.6862025260925293, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2057413086295128, "step": 12315 }, { "epoch": 0.1839046132941115, "grad_norm": 0.27734375, "grad_norm_var": 0.0009137312571207682, "learning_rate": 0.0001, "loss": 1.4235, "loss/crossentropy": 2.5351598262786865, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.2125166952610016, "step": 12316 }, { "epoch": 0.18391954546472647, "grad_norm": 0.302734375, "grad_norm_var": 0.0008603254954020182, "learning_rate": 0.0001, "loss": 1.3866, "loss/crossentropy": 2.6945929527282715, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18738018721342087, "step": 12317 }, { "epoch": 0.18393447763534146, "grad_norm": 0.310546875, "grad_norm_var": 0.0008447647094726562, "learning_rate": 0.0001, "loss": 1.2342, "loss/crossentropy": 2.624473214149475, "loss/fcd": 1.078125, "loss/idx": 11.0, "loss/logits": 0.15609024465084076, "step": 12318 }, { "epoch": 0.18394940980595645, "grad_norm": 0.30859375, "grad_norm_var": 0.0007944583892822265, "learning_rate": 0.0001, "loss": 1.4302, "loss/crossentropy": 2.5638829469680786, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19978077709674835, "step": 12319 }, { "epoch": 0.1839643419765714, "grad_norm": 0.3125, "grad_norm_var": 0.0007870833079020183, "learning_rate": 0.0001, "loss": 1.3871, "loss/crossentropy": 2.58806312084198, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18398816138505936, "step": 12320 }, { "epoch": 0.1839792741471864, "grad_norm": 0.3203125, "grad_norm_var": 0.000680987040201823, "learning_rate": 0.0001, "loss": 1.3392, "loss/crossentropy": 2.6203495264053345, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1751435175538063, "step": 12321 }, { "epoch": 0.1839942063178014, "grad_norm": 0.275390625, "grad_norm_var": 0.0005340417226155599, "learning_rate": 0.0001, "loss": 1.4587, "loss/crossentropy": 2.47620165348053, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.22824188321828842, "step": 12322 }, { "epoch": 0.18400913848841638, "grad_norm": 0.3046875, "grad_norm_var": 0.0005328973134358724, "learning_rate": 0.0001, "loss": 1.4564, "loss/crossentropy": 2.511858820915222, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21422339975833893, "step": 12323 }, { "epoch": 0.18402407065903134, "grad_norm": 0.431640625, "grad_norm_var": 0.0015736897786458334, "learning_rate": 0.0001, "loss": 1.8034, "loss/crossentropy": 2.604287266731262, "loss/fcd": 1.515625, "loss/idx": 11.0, "loss/logits": 0.2877565547823906, "step": 12324 }, { "epoch": 0.18403900282964633, "grad_norm": 0.30078125, "grad_norm_var": 0.0015591939290364584, "learning_rate": 0.0001, "loss": 1.4168, "loss/crossentropy": 2.0777706503868103, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19806187599897385, "step": 12325 }, { "epoch": 0.18405393500026132, "grad_norm": 0.279296875, "grad_norm_var": 0.001613601048787435, "learning_rate": 0.0001, "loss": 1.2923, "loss/crossentropy": 2.4552345275878906, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1673496589064598, "step": 12326 }, { "epoch": 0.18406886717087628, "grad_norm": 0.375, "grad_norm_var": 0.0016803582509358724, "learning_rate": 0.0001, "loss": 1.7093, "loss/crossentropy": 2.511728048324585, "loss/fcd": 1.4609375, "loss/idx": 11.0, "loss/logits": 0.2483953982591629, "step": 12327 }, { "epoch": 0.18408379934149127, "grad_norm": 0.33984375, "grad_norm_var": 0.0017087300618489583, "learning_rate": 0.0001, "loss": 1.4807, "loss/crossentropy": 2.5708796977996826, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21898118406534195, "step": 12328 }, { "epoch": 0.18409873151210626, "grad_norm": 0.41015625, "grad_norm_var": 0.0021860599517822266, "learning_rate": 0.0001, "loss": 1.9559, "loss/crossentropy": 2.400828719139099, "loss/fcd": 1.5859375, "loss/idx": 11.0, "loss/logits": 0.3699137344956398, "step": 12329 }, { "epoch": 0.18411366368272125, "grad_norm": 0.265625, "grad_norm_var": 0.0022958755493164063, "learning_rate": 0.0001, "loss": 1.2762, "loss/crossentropy": 2.6771914958953857, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.15902334451675415, "step": 12330 }, { "epoch": 0.1841285958533362, "grad_norm": 0.341796875, "grad_norm_var": 0.0022669474283854166, "learning_rate": 0.0001, "loss": 1.4145, "loss/crossentropy": 2.827772378921509, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.18790820240974426, "step": 12331 }, { "epoch": 0.1841435280239512, "grad_norm": 0.3203125, "grad_norm_var": 0.002124977111816406, "learning_rate": 0.0001, "loss": 1.4268, "loss/crossentropy": 2.5254770517349243, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.18458709865808487, "step": 12332 }, { "epoch": 0.1841584601945662, "grad_norm": 0.306640625, "grad_norm_var": 0.0021143595377604166, "learning_rate": 0.0001, "loss": 1.4582, "loss/crossentropy": 2.680196166038513, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20817158371210098, "step": 12333 }, { "epoch": 0.18417339236518115, "grad_norm": 0.28515625, "grad_norm_var": 0.0022042433420817057, "learning_rate": 0.0001, "loss": 1.3497, "loss/crossentropy": 2.724309802055359, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1856372356414795, "step": 12334 }, { "epoch": 0.18418832453579614, "grad_norm": 0.392578125, "grad_norm_var": 0.002476946512858073, "learning_rate": 0.0001, "loss": 1.3675, "loss/crossentropy": 2.690287232398987, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18000411987304688, "step": 12335 }, { "epoch": 0.18420325670641113, "grad_norm": 0.322265625, "grad_norm_var": 0.0024616082509358725, "learning_rate": 0.0001, "loss": 1.3643, "loss/crossentropy": 2.5057300329208374, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.16895830631256104, "step": 12336 }, { "epoch": 0.1842181888770261, "grad_norm": 0.296875, "grad_norm_var": 0.002524550755818685, "learning_rate": 0.0001, "loss": 1.3211, "loss/crossentropy": 2.6881613731384277, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17661282420158386, "step": 12337 }, { "epoch": 0.18423312104764109, "grad_norm": 0.30078125, "grad_norm_var": 0.0023867289225260415, "learning_rate": 0.0001, "loss": 1.5035, "loss/crossentropy": 2.7466148138046265, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2183382660150528, "step": 12338 }, { "epoch": 0.18424805321825607, "grad_norm": 0.283203125, "grad_norm_var": 0.0024869124094645183, "learning_rate": 0.0001, "loss": 1.4028, "loss/crossentropy": 2.676753878593445, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19966397434473038, "step": 12339 }, { "epoch": 0.18426298538887106, "grad_norm": 0.427734375, "grad_norm_var": 0.0024340152740478516, "learning_rate": 0.0001, "loss": 1.4283, "loss/crossentropy": 2.5492461919784546, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.18220862746238708, "step": 12340 }, { "epoch": 0.18427791755948603, "grad_norm": 0.38671875, "grad_norm_var": 0.0025836785634358725, "learning_rate": 0.0001, "loss": 1.3952, "loss/crossentropy": 2.675367832183838, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19986801594495773, "step": 12341 }, { "epoch": 0.18429284973010102, "grad_norm": 0.298828125, "grad_norm_var": 0.002466694513956706, "learning_rate": 0.0001, "loss": 1.4218, "loss/crossentropy": 2.722944736480713, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19917186349630356, "step": 12342 }, { "epoch": 0.184307781900716, "grad_norm": 0.275390625, "grad_norm_var": 0.002550188700358073, "learning_rate": 0.0001, "loss": 1.3309, "loss/crossentropy": 2.652132034301758, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17853953689336777, "step": 12343 }, { "epoch": 0.18432271407133097, "grad_norm": 0.283203125, "grad_norm_var": 0.002664041519165039, "learning_rate": 0.0001, "loss": 1.3139, "loss/crossentropy": 2.6531288623809814, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17330553382635117, "step": 12344 }, { "epoch": 0.18433764624194596, "grad_norm": 0.390625, "grad_norm_var": 0.002465677261352539, "learning_rate": 0.0001, "loss": 1.4164, "loss/crossentropy": 2.6412601470947266, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.18591416627168655, "step": 12345 }, { "epoch": 0.18435257841256095, "grad_norm": 0.37109375, "grad_norm_var": 0.0023455142974853514, "learning_rate": 0.0001, "loss": 1.5508, "loss/crossentropy": 2.59296977519989, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.23052388429641724, "step": 12346 }, { "epoch": 0.18436751058317594, "grad_norm": 0.314453125, "grad_norm_var": 0.002349964777628581, "learning_rate": 0.0001, "loss": 1.304, "loss/crossentropy": 2.769202947616577, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17121250927448273, "step": 12347 }, { "epoch": 0.1843824427537909, "grad_norm": 0.279296875, "grad_norm_var": 0.002499834696451823, "learning_rate": 0.0001, "loss": 1.3061, "loss/crossentropy": 2.6455732583999634, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1732931211590767, "step": 12348 }, { "epoch": 0.1843973749244059, "grad_norm": 0.314453125, "grad_norm_var": 0.002483558654785156, "learning_rate": 0.0001, "loss": 1.4766, "loss/crossentropy": 2.5374042987823486, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21098296344280243, "step": 12349 }, { "epoch": 0.18441230709502088, "grad_norm": 0.30078125, "grad_norm_var": 0.002412859598795573, "learning_rate": 0.0001, "loss": 1.3129, "loss/crossentropy": 2.37766695022583, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.16450528800487518, "step": 12350 }, { "epoch": 0.18442723926563584, "grad_norm": 0.328125, "grad_norm_var": 0.002112309137980143, "learning_rate": 0.0001, "loss": 1.4863, "loss/crossentropy": 2.5643153190612793, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.21672095358371735, "step": 12351 }, { "epoch": 0.18444217143625083, "grad_norm": 0.279296875, "grad_norm_var": 0.002233997980753581, "learning_rate": 0.0001, "loss": 1.355, "loss/crossentropy": 2.806441307067871, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.19090330600738525, "step": 12352 }, { "epoch": 0.18445710360686582, "grad_norm": 0.296875, "grad_norm_var": 0.002233997980753581, "learning_rate": 0.0001, "loss": 1.4442, "loss/crossentropy": 2.43008816242218, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21372021734714508, "step": 12353 }, { "epoch": 0.18447203577748078, "grad_norm": 0.275390625, "grad_norm_var": 0.0023416519165039063, "learning_rate": 0.0001, "loss": 1.3454, "loss/crossentropy": 2.6112990379333496, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18138083070516586, "step": 12354 }, { "epoch": 0.18448696794809577, "grad_norm": 0.265625, "grad_norm_var": 0.002445077896118164, "learning_rate": 0.0001, "loss": 1.3882, "loss/crossentropy": 2.5200228691101074, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19289329648017883, "step": 12355 }, { "epoch": 0.18450190011871076, "grad_norm": 0.287109375, "grad_norm_var": 0.001623392105102539, "learning_rate": 0.0001, "loss": 1.2971, "loss/crossentropy": 2.594268560409546, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.16424604505300522, "step": 12356 }, { "epoch": 0.18451683228932575, "grad_norm": 0.30859375, "grad_norm_var": 0.0011974175771077473, "learning_rate": 0.0001, "loss": 1.4952, "loss/crossentropy": 2.5371453762054443, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.21004829555749893, "step": 12357 }, { "epoch": 0.1845317644599407, "grad_norm": 0.3203125, "grad_norm_var": 0.0012105305989583333, "learning_rate": 0.0001, "loss": 1.5106, "loss/crossentropy": 2.276554822921753, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.2137320712208748, "step": 12358 }, { "epoch": 0.1845466966305557, "grad_norm": 0.328125, "grad_norm_var": 0.0011714776357014975, "learning_rate": 0.0001, "loss": 1.7209, "loss/crossentropy": 2.5441216230392456, "loss/fcd": 1.44140625, "loss/idx": 11.0, "loss/logits": 0.279541015625, "step": 12359 }, { "epoch": 0.1845616288011707, "grad_norm": 0.259765625, "grad_norm_var": 0.0012863000233968099, "learning_rate": 0.0001, "loss": 1.3235, "loss/crossentropy": 2.548168659210205, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17898950725793839, "step": 12360 }, { "epoch": 0.18457656097178565, "grad_norm": 0.283203125, "grad_norm_var": 0.0008168538411458333, "learning_rate": 0.0001, "loss": 1.3685, "loss/crossentropy": 2.615713357925415, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19268371164798737, "step": 12361 }, { "epoch": 0.18459149314240064, "grad_norm": 0.2734375, "grad_norm_var": 0.0004973729451497396, "learning_rate": 0.0001, "loss": 1.4222, "loss/crossentropy": 2.589626669883728, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.21130888164043427, "step": 12362 }, { "epoch": 0.18460642531301563, "grad_norm": 0.404296875, "grad_norm_var": 0.0012387593587239584, "learning_rate": 0.0001, "loss": 1.4562, "loss/crossentropy": 2.6164907217025757, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19837477803230286, "step": 12363 }, { "epoch": 0.18462135748363062, "grad_norm": 0.69140625, "grad_norm_var": 0.010699701309204102, "learning_rate": 0.0001, "loss": 1.6359, "loss/crossentropy": 2.656423330307007, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.30779364705085754, "step": 12364 }, { "epoch": 0.18463628965424558, "grad_norm": 0.375, "grad_norm_var": 0.010835202534993489, "learning_rate": 0.0001, "loss": 1.8448, "loss/crossentropy": 2.559885859489441, "loss/fcd": 1.50390625, "loss/idx": 11.0, "loss/logits": 0.34092575311660767, "step": 12365 }, { "epoch": 0.18465122182486057, "grad_norm": 0.365234375, "grad_norm_var": 0.010845168431599935, "learning_rate": 0.0001, "loss": 1.7115, "loss/crossentropy": 2.532568335533142, "loss/fcd": 1.4453125, "loss/idx": 11.0, "loss/logits": 0.266159288585186, "step": 12366 }, { "epoch": 0.18466615399547556, "grad_norm": 0.34375, "grad_norm_var": 0.010848474502563477, "learning_rate": 0.0001, "loss": 1.4733, "loss/crossentropy": 2.874277710914612, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.22327370941638947, "step": 12367 }, { "epoch": 0.18468108616609052, "grad_norm": 0.291015625, "grad_norm_var": 0.010770273208618165, "learning_rate": 0.0001, "loss": 1.398, "loss/crossentropy": 2.7296667098999023, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19877924025058746, "step": 12368 }, { "epoch": 0.18469601833670551, "grad_norm": 0.314453125, "grad_norm_var": 0.010698890686035157, "learning_rate": 0.0001, "loss": 1.5121, "loss/crossentropy": 2.3908270597457886, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.20739082992076874, "step": 12369 }, { "epoch": 0.1847109505073205, "grad_norm": 0.3046875, "grad_norm_var": 0.01051316261291504, "learning_rate": 0.0001, "loss": 1.5384, "loss/crossentropy": 2.605441689491272, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.23369435966014862, "step": 12370 }, { "epoch": 0.18472588267793547, "grad_norm": 0.328125, "grad_norm_var": 0.010150003433227538, "learning_rate": 0.0001, "loss": 1.4703, "loss/crossentropy": 2.838473916053772, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20076341181993484, "step": 12371 }, { "epoch": 0.18474081484855046, "grad_norm": 0.32421875, "grad_norm_var": 0.00996246337890625, "learning_rate": 0.0001, "loss": 1.5156, "loss/crossentropy": 2.4572906494140625, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.214787058532238, "step": 12372 }, { "epoch": 0.18475574701916544, "grad_norm": 0.330078125, "grad_norm_var": 0.00988780657450358, "learning_rate": 0.0001, "loss": 1.3211, "loss/crossentropy": 2.543274164199829, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1726907193660736, "step": 12373 }, { "epoch": 0.18477067918978043, "grad_norm": 0.296875, "grad_norm_var": 0.010002628962198893, "learning_rate": 0.0001, "loss": 1.4139, "loss/crossentropy": 2.731650471687317, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19511255621910095, "step": 12374 }, { "epoch": 0.1847856113603954, "grad_norm": 0.302734375, "grad_norm_var": 0.010098711649576823, "learning_rate": 0.0001, "loss": 1.3402, "loss/crossentropy": 2.7051254510879517, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18782421946525574, "step": 12375 }, { "epoch": 0.18480054353101039, "grad_norm": 0.294921875, "grad_norm_var": 0.009785715738932292, "learning_rate": 0.0001, "loss": 1.3592, "loss/crossentropy": 2.6838210821151733, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18343952298164368, "step": 12376 }, { "epoch": 0.18481547570162538, "grad_norm": 0.279296875, "grad_norm_var": 0.009818967183430989, "learning_rate": 0.0001, "loss": 1.493, "loss/crossentropy": 2.4429216384887695, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21959027647972107, "step": 12377 }, { "epoch": 0.18483040787224034, "grad_norm": 0.296875, "grad_norm_var": 0.009629758199055989, "learning_rate": 0.0001, "loss": 1.332, "loss/crossentropy": 2.6817290782928467, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.18356165289878845, "step": 12378 }, { "epoch": 0.18484534004285533, "grad_norm": 0.3359375, "grad_norm_var": 0.009394439061482747, "learning_rate": 0.0001, "loss": 1.5588, "loss/crossentropy": 2.3914432525634766, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.2111116126179695, "step": 12379 }, { "epoch": 0.18486027221347032, "grad_norm": 0.275390625, "grad_norm_var": 0.0008391698201497395, "learning_rate": 0.0001, "loss": 1.2597, "loss/crossentropy": 2.6353440284729004, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.15809479355812073, "step": 12380 }, { "epoch": 0.18487520438408528, "grad_norm": 0.27734375, "grad_norm_var": 0.000669097900390625, "learning_rate": 0.0001, "loss": 1.3008, "loss/crossentropy": 2.5585074424743652, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1679745316505432, "step": 12381 }, { "epoch": 0.18489013655470027, "grad_norm": 0.6796875, "grad_norm_var": 0.009162505467732748, "learning_rate": 0.0001, "loss": 1.7408, "loss/crossentropy": 2.6488736867904663, "loss/fcd": 1.4765625, "loss/idx": 11.0, "loss/logits": 0.2642545625567436, "step": 12382 }, { "epoch": 0.18490506872531526, "grad_norm": 0.322265625, "grad_norm_var": 0.009151140848795572, "learning_rate": 0.0001, "loss": 1.373, "loss/crossentropy": 2.617241144180298, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18552204221487045, "step": 12383 }, { "epoch": 0.18492000089593025, "grad_norm": 0.2734375, "grad_norm_var": 0.009258000055948894, "learning_rate": 0.0001, "loss": 1.4321, "loss/crossentropy": 2.7777549028396606, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.21723710000514984, "step": 12384 }, { "epoch": 0.1849349330665452, "grad_norm": 0.359375, "grad_norm_var": 0.009307352701822917, "learning_rate": 0.0001, "loss": 1.6494, "loss/crossentropy": 2.5655468702316284, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.25098007172346115, "step": 12385 }, { "epoch": 0.1849498652371602, "grad_norm": 0.365234375, "grad_norm_var": 0.009331496556599934, "learning_rate": 0.0001, "loss": 1.3734, "loss/crossentropy": 2.6331132650375366, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18199248611927032, "step": 12386 }, { "epoch": 0.1849647974077752, "grad_norm": 0.28515625, "grad_norm_var": 0.009479761123657227, "learning_rate": 0.0001, "loss": 1.3343, "loss/crossentropy": 2.33207368850708, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17417369037866592, "step": 12387 }, { "epoch": 0.18497972957839015, "grad_norm": 0.3125, "grad_norm_var": 0.009499216079711914, "learning_rate": 0.0001, "loss": 1.3979, "loss/crossentropy": 2.711247205734253, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19083797931671143, "step": 12388 }, { "epoch": 0.18499466174900514, "grad_norm": 0.29296875, "grad_norm_var": 0.00958709716796875, "learning_rate": 0.0001, "loss": 1.3238, "loss/crossentropy": 2.7185888290405273, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17531975358724594, "step": 12389 }, { "epoch": 0.18500959391962013, "grad_norm": 0.330078125, "grad_norm_var": 0.009517653783162435, "learning_rate": 0.0001, "loss": 1.5279, "loss/crossentropy": 2.564876079559326, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2153826504945755, "step": 12390 }, { "epoch": 0.18502452609023512, "grad_norm": 0.3203125, "grad_norm_var": 0.00947259267171224, "learning_rate": 0.0001, "loss": 1.5361, "loss/crossentropy": 2.483034372329712, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.21971184760332108, "step": 12391 }, { "epoch": 0.18503945826085008, "grad_norm": 0.296875, "grad_norm_var": 0.009463357925415038, "learning_rate": 0.0001, "loss": 1.4107, "loss/crossentropy": 2.6650147438049316, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19975419342517853, "step": 12392 }, { "epoch": 0.18505439043146507, "grad_norm": 0.43359375, "grad_norm_var": 0.009878985087076823, "learning_rate": 0.0001, "loss": 1.5779, "loss/crossentropy": 3.4308031797409058, "loss/fcd": 1.40234375, "loss/idx": 11.0, "loss/logits": 0.17557043582201004, "step": 12393 }, { "epoch": 0.18506932260208006, "grad_norm": 0.326171875, "grad_norm_var": 0.009760014216105143, "learning_rate": 0.0001, "loss": 1.4547, "loss/crossentropy": 2.7749937772750854, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20074458420276642, "step": 12394 }, { "epoch": 0.18508425477269502, "grad_norm": 0.326171875, "grad_norm_var": 0.009775034586588542, "learning_rate": 0.0001, "loss": 1.3102, "loss/crossentropy": 2.7414135932922363, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.18127456307411194, "step": 12395 }, { "epoch": 0.18509918694331, "grad_norm": 0.294921875, "grad_norm_var": 0.009624671936035157, "learning_rate": 0.0001, "loss": 1.3874, "loss/crossentropy": 2.6309884786605835, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19204458594322205, "step": 12396 }, { "epoch": 0.185114119113925, "grad_norm": 0.28515625, "grad_norm_var": 0.009559567769368489, "learning_rate": 0.0001, "loss": 1.4117, "loss/crossentropy": 2.653838276863098, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19293688982725143, "step": 12397 }, { "epoch": 0.18512905128453996, "grad_norm": 0.2890625, "grad_norm_var": 0.0016122817993164062, "learning_rate": 0.0001, "loss": 1.3526, "loss/crossentropy": 2.669757604598999, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18464574217796326, "step": 12398 }, { "epoch": 0.18514398345515495, "grad_norm": 0.25390625, "grad_norm_var": 0.00187986691792806, "learning_rate": 0.0001, "loss": 1.2765, "loss/crossentropy": 2.4875251054763794, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.17497556656599045, "step": 12399 }, { "epoch": 0.18515891562576994, "grad_norm": 0.333984375, "grad_norm_var": 0.0017709732055664062, "learning_rate": 0.0001, "loss": 1.4539, "loss/crossentropy": 2.630878448486328, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2117088884115219, "step": 12400 }, { "epoch": 0.18517384779638493, "grad_norm": 0.298828125, "grad_norm_var": 0.0016748905181884766, "learning_rate": 0.0001, "loss": 1.5086, "loss/crossentropy": 2.5469707250595093, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22730092704296112, "step": 12401 }, { "epoch": 0.1851887799669999, "grad_norm": 0.318359375, "grad_norm_var": 0.0015001773834228515, "learning_rate": 0.0001, "loss": 1.4871, "loss/crossentropy": 2.684166193008423, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21368873119354248, "step": 12402 }, { "epoch": 0.18520371213761488, "grad_norm": 0.3359375, "grad_norm_var": 0.0014770348866780599, "learning_rate": 0.0001, "loss": 1.6347, "loss/crossentropy": 2.4766398668289185, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.248010516166687, "step": 12403 }, { "epoch": 0.18521864430822987, "grad_norm": 0.3515625, "grad_norm_var": 0.0015565077463785806, "learning_rate": 0.0001, "loss": 1.6098, "loss/crossentropy": 2.5624899864196777, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.2581876367330551, "step": 12404 }, { "epoch": 0.18523357647884484, "grad_norm": 0.283203125, "grad_norm_var": 0.0015950520833333333, "learning_rate": 0.0001, "loss": 1.2524, "loss/crossentropy": 2.483606696128845, "loss/fcd": 1.09375, "loss/idx": 11.0, "loss/logits": 0.15860647708177567, "step": 12405 }, { "epoch": 0.18524850864945983, "grad_norm": 0.271484375, "grad_norm_var": 0.0017104466756184896, "learning_rate": 0.0001, "loss": 1.3861, "loss/crossentropy": 2.545931100845337, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19466593861579895, "step": 12406 }, { "epoch": 0.18526344082007482, "grad_norm": 0.28125, "grad_norm_var": 0.0017714818318684896, "learning_rate": 0.0001, "loss": 1.475, "loss/crossentropy": 2.408697485923767, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20550568401813507, "step": 12407 }, { "epoch": 0.1852783729906898, "grad_norm": 0.310546875, "grad_norm_var": 0.001756906509399414, "learning_rate": 0.0001, "loss": 1.3143, "loss/crossentropy": 2.719214916229248, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.16974755376577377, "step": 12408 }, { "epoch": 0.18529330516130477, "grad_norm": 0.33984375, "grad_norm_var": 0.0007879734039306641, "learning_rate": 0.0001, "loss": 1.5848, "loss/crossentropy": 2.5673668384552, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.2489049807190895, "step": 12409 }, { "epoch": 0.18530823733191976, "grad_norm": 0.357421875, "grad_norm_var": 0.0009319146474202473, "learning_rate": 0.0001, "loss": 1.4263, "loss/crossentropy": 3.1289193630218506, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20360340923070908, "step": 12410 }, { "epoch": 0.18532316950253475, "grad_norm": 0.306640625, "grad_norm_var": 0.0009090264638264973, "learning_rate": 0.0001, "loss": 1.4997, "loss/crossentropy": 2.582487106323242, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2067352682352066, "step": 12411 }, { "epoch": 0.1853381016731497, "grad_norm": 0.294921875, "grad_norm_var": 0.0009090264638264973, "learning_rate": 0.0001, "loss": 1.3538, "loss/crossentropy": 2.6281538009643555, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18189889937639236, "step": 12412 }, { "epoch": 0.1853530338437647, "grad_norm": 0.357421875, "grad_norm_var": 0.0010248819986979167, "learning_rate": 0.0001, "loss": 1.7026, "loss/crossentropy": 2.4845272302627563, "loss/fcd": 1.4375, "loss/idx": 11.0, "loss/logits": 0.2650892958045006, "step": 12413 }, { "epoch": 0.1853679660143797, "grad_norm": 0.419921875, "grad_norm_var": 0.0017032464345296224, "learning_rate": 0.0001, "loss": 1.7805, "loss/crossentropy": 2.5100384950637817, "loss/fcd": 1.50390625, "loss/idx": 11.0, "loss/logits": 0.27663882076740265, "step": 12414 }, { "epoch": 0.18538289818499465, "grad_norm": 0.275390625, "grad_norm_var": 0.0015436172485351562, "learning_rate": 0.0001, "loss": 1.338, "loss/crossentropy": 2.7317419052124023, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.181790292263031, "step": 12415 }, { "epoch": 0.18539783035560964, "grad_norm": 0.376953125, "grad_norm_var": 0.0017331441243489583, "learning_rate": 0.0001, "loss": 1.4409, "loss/crossentropy": 2.6390820741653442, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.1987307444214821, "step": 12416 }, { "epoch": 0.18541276252622463, "grad_norm": 0.357421875, "grad_norm_var": 0.0017531712849934895, "learning_rate": 0.0001, "loss": 1.679, "loss/crossentropy": 2.7569369077682495, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.2688296362757683, "step": 12417 }, { "epoch": 0.18542769469683962, "grad_norm": 0.2890625, "grad_norm_var": 0.0018421014149983723, "learning_rate": 0.0001, "loss": 1.365, "loss/crossentropy": 2.552993655204773, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18531106412410736, "step": 12418 }, { "epoch": 0.18544262686745458, "grad_norm": 0.330078125, "grad_norm_var": 0.0018361409505208333, "learning_rate": 0.0001, "loss": 1.5942, "loss/crossentropy": 2.588745355606079, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.27002574503421783, "step": 12419 }, { "epoch": 0.18545755903806957, "grad_norm": 0.298828125, "grad_norm_var": 0.0018245538075764975, "learning_rate": 0.0001, "loss": 1.4583, "loss/crossentropy": 2.6876447200775146, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20441776514053345, "step": 12420 }, { "epoch": 0.18547249120868456, "grad_norm": 0.2890625, "grad_norm_var": 0.0017964680989583333, "learning_rate": 0.0001, "loss": 1.3177, "loss/crossentropy": 2.700550079345703, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1653803139925003, "step": 12421 }, { "epoch": 0.18548742337929952, "grad_norm": 0.40625, "grad_norm_var": 0.0020191033681233725, "learning_rate": 0.0001, "loss": 1.5747, "loss/crossentropy": 2.5926132202148438, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.22312818467617035, "step": 12422 }, { "epoch": 0.1855023555499145, "grad_norm": 0.330078125, "grad_norm_var": 0.0018462498982747396, "learning_rate": 0.0001, "loss": 1.4925, "loss/crossentropy": 2.395575761795044, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.20733490586280823, "step": 12423 }, { "epoch": 0.1855172877205295, "grad_norm": 0.287109375, "grad_norm_var": 0.0019530614217122396, "learning_rate": 0.0001, "loss": 1.3229, "loss/crossentropy": 2.5301390886306763, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.170570470392704, "step": 12424 }, { "epoch": 0.1855322198911445, "grad_norm": 0.287109375, "grad_norm_var": 0.0020736535390218098, "learning_rate": 0.0001, "loss": 1.4205, "loss/crossentropy": 2.7866437435150146, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20960143953561783, "step": 12425 }, { "epoch": 0.18554715206175945, "grad_norm": 0.291015625, "grad_norm_var": 0.0020974318186442057, "learning_rate": 0.0001, "loss": 1.3589, "loss/crossentropy": 2.65723979473114, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18707461655139923, "step": 12426 }, { "epoch": 0.18556208423237444, "grad_norm": 0.28515625, "grad_norm_var": 0.002178382873535156, "learning_rate": 0.0001, "loss": 1.426, "loss/crossentropy": 2.6542043685913086, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20334542542696, "step": 12427 }, { "epoch": 0.18557701640298943, "grad_norm": 0.30078125, "grad_norm_var": 0.0021582126617431642, "learning_rate": 0.0001, "loss": 1.4525, "loss/crossentropy": 2.714603066444397, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2024693787097931, "step": 12428 }, { "epoch": 0.1855919485736044, "grad_norm": 0.306640625, "grad_norm_var": 0.0020920912424723308, "learning_rate": 0.0001, "loss": 1.5053, "loss/crossentropy": 2.626936435699463, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22403442859649658, "step": 12429 }, { "epoch": 0.18560688074421938, "grad_norm": 0.322265625, "grad_norm_var": 0.0013959089914957683, "learning_rate": 0.0001, "loss": 1.4438, "loss/crossentropy": 2.5691839456558228, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.19768106192350388, "step": 12430 }, { "epoch": 0.18562181291483437, "grad_norm": 0.326171875, "grad_norm_var": 0.0012917677561442057, "learning_rate": 0.0001, "loss": 1.324, "loss/crossentropy": 2.620153784751892, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1716870740056038, "step": 12431 }, { "epoch": 0.18563674508544933, "grad_norm": 0.80078125, "grad_norm_var": 0.01586430867513021, "learning_rate": 0.0001, "loss": 1.9784, "loss/crossentropy": 2.2727649807929993, "loss/fcd": 1.66015625, "loss/idx": 11.0, "loss/logits": 0.31820718944072723, "step": 12432 }, { "epoch": 0.18565167725606432, "grad_norm": 0.36328125, "grad_norm_var": 0.01587675412495931, "learning_rate": 0.0001, "loss": 1.465, "loss/crossentropy": 2.6695139408111572, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20323865115642548, "step": 12433 }, { "epoch": 0.1856666094266793, "grad_norm": 0.318359375, "grad_norm_var": 0.01571343739827474, "learning_rate": 0.0001, "loss": 1.409, "loss/crossentropy": 2.86322557926178, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.2058342695236206, "step": 12434 }, { "epoch": 0.1856815415972943, "grad_norm": 0.326171875, "grad_norm_var": 0.015722910563151043, "learning_rate": 0.0001, "loss": 1.4519, "loss/crossentropy": 2.7090083360671997, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21755358576774597, "step": 12435 }, { "epoch": 0.18569647376790926, "grad_norm": 0.283203125, "grad_norm_var": 0.01583684285481771, "learning_rate": 0.0001, "loss": 1.4077, "loss/crossentropy": 2.6970423460006714, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19677217304706573, "step": 12436 }, { "epoch": 0.18571140593852425, "grad_norm": 0.35546875, "grad_norm_var": 0.015615272521972656, "learning_rate": 0.0001, "loss": 1.5969, "loss/crossentropy": 2.5817599296569824, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.2492770254611969, "step": 12437 }, { "epoch": 0.18572633810913924, "grad_norm": 0.259765625, "grad_norm_var": 0.015845346450805663, "learning_rate": 0.0001, "loss": 1.2562, "loss/crossentropy": 2.5179048776626587, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.15853530913591385, "step": 12438 }, { "epoch": 0.1857412702797542, "grad_norm": 0.330078125, "grad_norm_var": 0.015845346450805663, "learning_rate": 0.0001, "loss": 1.513, "loss/crossentropy": 2.3463319540023804, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.19662468135356903, "step": 12439 }, { "epoch": 0.1857562024503692, "grad_norm": 0.302734375, "grad_norm_var": 0.01574997901916504, "learning_rate": 0.0001, "loss": 1.3586, "loss/crossentropy": 3.0467945337295532, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18672039359807968, "step": 12440 }, { "epoch": 0.18577113462098419, "grad_norm": 0.470703125, "grad_norm_var": 0.01653288205464681, "learning_rate": 0.0001, "loss": 1.7544, "loss/crossentropy": 2.709073066711426, "loss/fcd": 1.4609375, "loss/idx": 11.0, "loss/logits": 0.29350346326828003, "step": 12441 }, { "epoch": 0.18578606679159915, "grad_norm": 0.314453125, "grad_norm_var": 0.016374572118123373, "learning_rate": 0.0001, "loss": 1.4103, "loss/crossentropy": 2.75929594039917, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19546733796596527, "step": 12442 }, { "epoch": 0.18580099896221414, "grad_norm": 0.326171875, "grad_norm_var": 0.01610253651936849, "learning_rate": 0.0001, "loss": 1.3535, "loss/crossentropy": 2.6015766859054565, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1659909263253212, "step": 12443 }, { "epoch": 0.18581593113282913, "grad_norm": 0.298828125, "grad_norm_var": 0.016117334365844727, "learning_rate": 0.0001, "loss": 1.4672, "loss/crossentropy": 2.6615830659866333, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20938926190137863, "step": 12444 }, { "epoch": 0.18583086330344412, "grad_norm": 0.31640625, "grad_norm_var": 0.01605828603108724, "learning_rate": 0.0001, "loss": 1.3171, "loss/crossentropy": 2.698778510093689, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.1765163242816925, "step": 12445 }, { "epoch": 0.18584579547405908, "grad_norm": 0.34375, "grad_norm_var": 0.015987126032511394, "learning_rate": 0.0001, "loss": 1.3934, "loss/crossentropy": 2.485788106918335, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19029894471168518, "step": 12446 }, { "epoch": 0.18586072764467407, "grad_norm": 0.30859375, "grad_norm_var": 0.016082255045572917, "learning_rate": 0.0001, "loss": 1.2687, "loss/crossentropy": 2.586641550064087, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.17104724794626236, "step": 12447 }, { "epoch": 0.18587565981528906, "grad_norm": 0.28515625, "grad_norm_var": 0.0022181193033854168, "learning_rate": 0.0001, "loss": 1.3943, "loss/crossentropy": 2.615064263343811, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19114013016223907, "step": 12448 }, { "epoch": 0.18589059198590402, "grad_norm": 0.33984375, "grad_norm_var": 0.0021334330240885416, "learning_rate": 0.0001, "loss": 1.4417, "loss/crossentropy": 2.8220834732055664, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.21517911553382874, "step": 12449 }, { "epoch": 0.185905524156519, "grad_norm": 0.431640625, "grad_norm_var": 0.0028543472290039062, "learning_rate": 0.0001, "loss": 1.8208, "loss/crossentropy": 2.4956860542297363, "loss/fcd": 1.484375, "loss/idx": 11.0, "loss/logits": 0.3363960087299347, "step": 12450 }, { "epoch": 0.185920456327134, "grad_norm": 0.326171875, "grad_norm_var": 0.0028543472290039062, "learning_rate": 0.0001, "loss": 1.4472, "loss/crossentropy": 2.486501455307007, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.22058884799480438, "step": 12451 }, { "epoch": 0.185935388497749, "grad_norm": 0.349609375, "grad_norm_var": 0.00270843505859375, "learning_rate": 0.0001, "loss": 1.3594, "loss/crossentropy": 2.5918431282043457, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.17186283320188522, "step": 12452 }, { "epoch": 0.18595032066836395, "grad_norm": 0.318359375, "grad_norm_var": 0.002693033218383789, "learning_rate": 0.0001, "loss": 1.3671, "loss/crossentropy": 2.7319835424423218, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1834961175918579, "step": 12453 }, { "epoch": 0.18596525283897894, "grad_norm": 0.314453125, "grad_norm_var": 0.0023485660552978516, "learning_rate": 0.0001, "loss": 1.3459, "loss/crossentropy": 2.751554250717163, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1779419705271721, "step": 12454 }, { "epoch": 0.18598018500959393, "grad_norm": 0.341796875, "grad_norm_var": 0.0023478031158447265, "learning_rate": 0.0001, "loss": 1.4553, "loss/crossentropy": 2.3877865076065063, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19751526415348053, "step": 12455 }, { "epoch": 0.1859951171802089, "grad_norm": 0.3203125, "grad_norm_var": 0.00228729248046875, "learning_rate": 0.0001, "loss": 1.3029, "loss/crossentropy": 2.645153284072876, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.17786924540996552, "step": 12456 }, { "epoch": 0.18601004935082388, "grad_norm": 0.359375, "grad_norm_var": 0.0010904788970947266, "learning_rate": 0.0001, "loss": 1.4782, "loss/crossentropy": 2.6678653955459595, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20863424241542816, "step": 12457 }, { "epoch": 0.18602498152143887, "grad_norm": 0.30859375, "grad_norm_var": 0.001105499267578125, "learning_rate": 0.0001, "loss": 1.4402, "loss/crossentropy": 2.2946817874908447, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19016186147928238, "step": 12458 }, { "epoch": 0.18603991369205383, "grad_norm": 0.310546875, "grad_norm_var": 0.001129913330078125, "learning_rate": 0.0001, "loss": 1.3876, "loss/crossentropy": 2.630358934402466, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18058205395936966, "step": 12459 }, { "epoch": 0.18605484586266882, "grad_norm": 0.34375, "grad_norm_var": 0.001071786880493164, "learning_rate": 0.0001, "loss": 1.465, "loss/crossentropy": 2.637877583503723, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.19935619086027145, "step": 12460 }, { "epoch": 0.1860697780332838, "grad_norm": 0.2890625, "grad_norm_var": 0.0011768182118733724, "learning_rate": 0.0001, "loss": 1.3733, "loss/crossentropy": 2.6628050804138184, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1858382448554039, "step": 12461 }, { "epoch": 0.1860847102038988, "grad_norm": 0.361328125, "grad_norm_var": 0.0012267430623372395, "learning_rate": 0.0001, "loss": 1.5654, "loss/crossentropy": 2.795600414276123, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.24508382380008698, "step": 12462 }, { "epoch": 0.18609964237451376, "grad_norm": 0.287109375, "grad_norm_var": 0.001322031021118164, "learning_rate": 0.0001, "loss": 1.3501, "loss/crossentropy": 2.5005279779434204, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18217162787914276, "step": 12463 }, { "epoch": 0.18611457454512875, "grad_norm": 0.28515625, "grad_norm_var": 0.001322031021118164, "learning_rate": 0.0001, "loss": 1.4284, "loss/crossentropy": 2.638399124145508, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20182989537715912, "step": 12464 }, { "epoch": 0.18612950671574374, "grad_norm": 0.26953125, "grad_norm_var": 0.0015429019927978515, "learning_rate": 0.0001, "loss": 1.5112, "loss/crossentropy": 2.596429944038391, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.2338736578822136, "step": 12465 }, { "epoch": 0.1861444388863587, "grad_norm": 0.2890625, "grad_norm_var": 0.0008061091105143229, "learning_rate": 0.0001, "loss": 1.4011, "loss/crossentropy": 2.4248509407043457, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.18235738575458527, "step": 12466 }, { "epoch": 0.1861593710569737, "grad_norm": 0.271484375, "grad_norm_var": 0.0009271621704101563, "learning_rate": 0.0001, "loss": 1.2933, "loss/crossentropy": 2.5950753688812256, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.16825871169567108, "step": 12467 }, { "epoch": 0.18617430322758868, "grad_norm": 0.322265625, "grad_norm_var": 0.000843048095703125, "learning_rate": 0.0001, "loss": 1.4092, "loss/crossentropy": 2.51039457321167, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.18267075717449188, "step": 12468 }, { "epoch": 0.18618923539820367, "grad_norm": 0.3125, "grad_norm_var": 0.0008402347564697265, "learning_rate": 0.0001, "loss": 1.5079, "loss/crossentropy": 2.569144129753113, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21883559972047806, "step": 12469 }, { "epoch": 0.18620416756881863, "grad_norm": 0.3203125, "grad_norm_var": 0.000844573974609375, "learning_rate": 0.0001, "loss": 1.4161, "loss/crossentropy": 2.495503544807434, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.18169962614774704, "step": 12470 }, { "epoch": 0.18621909973943362, "grad_norm": 0.2890625, "grad_norm_var": 0.0008089542388916016, "learning_rate": 0.0001, "loss": 1.334, "loss/crossentropy": 2.7705469131469727, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1777762994170189, "step": 12471 }, { "epoch": 0.18623403191004861, "grad_norm": 0.3046875, "grad_norm_var": 0.0008000532786051432, "learning_rate": 0.0001, "loss": 1.3408, "loss/crossentropy": 2.590726137161255, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.16506792604923248, "step": 12472 }, { "epoch": 0.18624896408066358, "grad_norm": 0.306640625, "grad_norm_var": 0.0006107966105143229, "learning_rate": 0.0001, "loss": 1.3523, "loss/crossentropy": 2.864351511001587, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18037782609462738, "step": 12473 }, { "epoch": 0.18626389625127857, "grad_norm": 0.267578125, "grad_norm_var": 0.0006932417551676432, "learning_rate": 0.0001, "loss": 1.2559, "loss/crossentropy": 2.497450828552246, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.15046662092208862, "step": 12474 }, { "epoch": 0.18627882842189356, "grad_norm": 0.275390625, "grad_norm_var": 0.0007298628489176433, "learning_rate": 0.0001, "loss": 1.4598, "loss/crossentropy": 2.649176597595215, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.21373900771141052, "step": 12475 }, { "epoch": 0.18629376059250852, "grad_norm": 0.3203125, "grad_norm_var": 0.0006264845530192057, "learning_rate": 0.0001, "loss": 1.437, "loss/crossentropy": 2.3757532835006714, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.1870335042476654, "step": 12476 }, { "epoch": 0.1863086927631235, "grad_norm": 0.32421875, "grad_norm_var": 0.0006608168284098307, "learning_rate": 0.0001, "loss": 1.3787, "loss/crossentropy": 2.6536242961883545, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1950809210538864, "step": 12477 }, { "epoch": 0.1863236249337385, "grad_norm": 0.26171875, "grad_norm_var": 0.0004719416300455729, "learning_rate": 0.0001, "loss": 1.354, "loss/crossentropy": 2.528105854988098, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17430441081523895, "step": 12478 }, { "epoch": 0.18633855710435349, "grad_norm": 0.296875, "grad_norm_var": 0.00046868324279785155, "learning_rate": 0.0001, "loss": 1.3694, "loss/crossentropy": 2.6114447116851807, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18967673182487488, "step": 12479 }, { "epoch": 0.18635348927496845, "grad_norm": 0.279296875, "grad_norm_var": 0.000478363037109375, "learning_rate": 0.0001, "loss": 1.3756, "loss/crossentropy": 2.6767687797546387, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19977965205907822, "step": 12480 }, { "epoch": 0.18636842144558344, "grad_norm": 0.279296875, "grad_norm_var": 0.0004518985748291016, "learning_rate": 0.0001, "loss": 1.4141, "loss/crossentropy": 2.462967872619629, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.1992909088730812, "step": 12481 }, { "epoch": 0.18638335361619843, "grad_norm": 0.359375, "grad_norm_var": 0.000704813003540039, "learning_rate": 0.0001, "loss": 1.6318, "loss/crossentropy": 2.4524513483047485, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.2372460961341858, "step": 12482 }, { "epoch": 0.1863982857868134, "grad_norm": 0.26953125, "grad_norm_var": 0.0007123311360677083, "learning_rate": 0.0001, "loss": 1.3756, "loss/crossentropy": 2.786038637161255, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1959560513496399, "step": 12483 }, { "epoch": 0.18641321795742838, "grad_norm": 0.3125, "grad_norm_var": 0.0006884098052978516, "learning_rate": 0.0001, "loss": 1.5621, "loss/crossentropy": 2.642967700958252, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.24955663084983826, "step": 12484 }, { "epoch": 0.18642815012804337, "grad_norm": 0.287109375, "grad_norm_var": 0.0006820042928059896, "learning_rate": 0.0001, "loss": 1.4072, "loss/crossentropy": 2.4981974363327026, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20019420981407166, "step": 12485 }, { "epoch": 0.18644308229865836, "grad_norm": 0.265625, "grad_norm_var": 0.0006998062133789062, "learning_rate": 0.0001, "loss": 1.2846, "loss/crossentropy": 2.602507710456848, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.17523867636919022, "step": 12486 }, { "epoch": 0.18645801446927332, "grad_norm": 0.296875, "grad_norm_var": 0.0006987889607747396, "learning_rate": 0.0001, "loss": 1.412, "loss/crossentropy": 2.6745306253433228, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20101554691791534, "step": 12487 }, { "epoch": 0.1864729466398883, "grad_norm": 0.30078125, "grad_norm_var": 0.00069427490234375, "learning_rate": 0.0001, "loss": 1.3689, "loss/crossentropy": 2.834262251853943, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1853337287902832, "step": 12488 }, { "epoch": 0.1864878788105033, "grad_norm": 0.26953125, "grad_norm_var": 0.0007175286610921224, "learning_rate": 0.0001, "loss": 1.4213, "loss/crossentropy": 2.573678493499756, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.1986127346754074, "step": 12489 }, { "epoch": 0.18650281098111826, "grad_norm": 0.3203125, "grad_norm_var": 0.0007222493489583334, "learning_rate": 0.0001, "loss": 1.446, "loss/crossentropy": 2.698819637298584, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.211601160466671, "step": 12490 }, { "epoch": 0.18651774315173325, "grad_norm": 0.28125, "grad_norm_var": 0.0007091363271077474, "learning_rate": 0.0001, "loss": 1.3851, "loss/crossentropy": 2.658362627029419, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18201985955238342, "step": 12491 }, { "epoch": 0.18653267532234824, "grad_norm": 0.32421875, "grad_norm_var": 0.0007231235504150391, "learning_rate": 0.0001, "loss": 1.5906, "loss/crossentropy": 2.5890854597091675, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.2507771924138069, "step": 12492 }, { "epoch": 0.1865476074929632, "grad_norm": 0.298828125, "grad_norm_var": 0.0006663004557291666, "learning_rate": 0.0001, "loss": 1.3853, "loss/crossentropy": 2.5133787393569946, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19390791654586792, "step": 12493 }, { "epoch": 0.1865625396635782, "grad_norm": 0.98828125, "grad_norm_var": 0.03053766886393229, "learning_rate": 0.0001, "loss": 1.7454, "loss/crossentropy": 3.4186266660690308, "loss/fcd": 1.5234375, "loss/idx": 11.0, "loss/logits": 0.2219809740781784, "step": 12494 }, { "epoch": 0.18657747183419318, "grad_norm": 0.28125, "grad_norm_var": 0.03064142862955729, "learning_rate": 0.0001, "loss": 1.3764, "loss/crossentropy": 2.5118011236190796, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19672077149152756, "step": 12495 }, { "epoch": 0.18659240400480817, "grad_norm": 0.33203125, "grad_norm_var": 0.030399815241495768, "learning_rate": 0.0001, "loss": 1.5118, "loss/crossentropy": 2.4702389240264893, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.21883351355791092, "step": 12496 }, { "epoch": 0.18660733617542313, "grad_norm": 0.30078125, "grad_norm_var": 0.030249977111816408, "learning_rate": 0.0001, "loss": 1.3922, "loss/crossentropy": 2.4977502822875977, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18126758188009262, "step": 12497 }, { "epoch": 0.18662226834603812, "grad_norm": 0.306640625, "grad_norm_var": 0.03030877113342285, "learning_rate": 0.0001, "loss": 1.3808, "loss/crossentropy": 2.6330032348632812, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1932658702135086, "step": 12498 }, { "epoch": 0.1866372005166531, "grad_norm": 0.267578125, "grad_norm_var": 0.030327288309733073, "learning_rate": 0.0001, "loss": 1.3285, "loss/crossentropy": 2.533882260322571, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.19180525839328766, "step": 12499 }, { "epoch": 0.18665213268726807, "grad_norm": 0.30859375, "grad_norm_var": 0.03034235636393229, "learning_rate": 0.0001, "loss": 1.3332, "loss/crossentropy": 2.7930808067321777, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18088626861572266, "step": 12500 }, { "epoch": 0.18666706485788306, "grad_norm": 0.318359375, "grad_norm_var": 0.030185699462890625, "learning_rate": 0.0001, "loss": 1.4881, "loss/crossentropy": 2.520902633666992, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.20681579411029816, "step": 12501 }, { "epoch": 0.18668199702849805, "grad_norm": 0.326171875, "grad_norm_var": 0.02980383237202962, "learning_rate": 0.0001, "loss": 1.4226, "loss/crossentropy": 2.7817498445510864, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.2155281826853752, "step": 12502 }, { "epoch": 0.18669692919911302, "grad_norm": 0.341796875, "grad_norm_var": 0.029641151428222656, "learning_rate": 0.0001, "loss": 1.6197, "loss/crossentropy": 2.657804012298584, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.24864359200000763, "step": 12503 }, { "epoch": 0.186711861369728, "grad_norm": 0.33984375, "grad_norm_var": 0.029491106669108074, "learning_rate": 0.0001, "loss": 1.6707, "loss/crossentropy": 2.493882179260254, "loss/fcd": 1.40234375, "loss/idx": 11.0, "loss/logits": 0.26838476955890656, "step": 12504 }, { "epoch": 0.186726793540343, "grad_norm": 0.3125, "grad_norm_var": 0.029143524169921876, "learning_rate": 0.0001, "loss": 1.4604, "loss/crossentropy": 2.692686676979065, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.21428925544023514, "step": 12505 }, { "epoch": 0.18674172571095798, "grad_norm": 0.296875, "grad_norm_var": 0.02928009033203125, "learning_rate": 0.0001, "loss": 1.3562, "loss/crossentropy": 2.6159937381744385, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18825732171535492, "step": 12506 }, { "epoch": 0.18675665788157295, "grad_norm": 0.32421875, "grad_norm_var": 0.028992652893066406, "learning_rate": 0.0001, "loss": 1.4138, "loss/crossentropy": 2.571266293525696, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.18335910141468048, "step": 12507 }, { "epoch": 0.18677159005218794, "grad_norm": 0.33984375, "grad_norm_var": 0.028945350646972658, "learning_rate": 0.0001, "loss": 1.3889, "loss/crossentropy": 2.6322001218795776, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.17405786365270615, "step": 12508 }, { "epoch": 0.18678652222280293, "grad_norm": 0.322265625, "grad_norm_var": 0.028803443908691405, "learning_rate": 0.0001, "loss": 1.4119, "loss/crossentropy": 2.5328891277313232, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19319070875644684, "step": 12509 }, { "epoch": 0.1868014543934179, "grad_norm": 0.271484375, "grad_norm_var": 0.0005527337392171224, "learning_rate": 0.0001, "loss": 1.3778, "loss/crossentropy": 2.379374146461487, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1981016844511032, "step": 12510 }, { "epoch": 0.18681638656403288, "grad_norm": 0.30859375, "grad_norm_var": 0.00048775672912597655, "learning_rate": 0.0001, "loss": 1.5204, "loss/crossentropy": 2.435872197151184, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.23521476238965988, "step": 12511 }, { "epoch": 0.18683131873464787, "grad_norm": 0.333984375, "grad_norm_var": 0.0004927953084309896, "learning_rate": 0.0001, "loss": 1.4525, "loss/crossentropy": 2.4552897214889526, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20250707864761353, "step": 12512 }, { "epoch": 0.18684625090526286, "grad_norm": 0.28125, "grad_norm_var": 0.0005503336588541667, "learning_rate": 0.0001, "loss": 1.3, "loss/crossentropy": 2.6938780546188354, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.17499258369207382, "step": 12513 }, { "epoch": 0.18686118307587782, "grad_norm": 0.474609375, "grad_norm_var": 0.002182451883951823, "learning_rate": 0.0001, "loss": 2.3796, "loss/crossentropy": 2.7524484395980835, "loss/fcd": 1.90234375, "loss/idx": 11.0, "loss/logits": 0.4772570878267288, "step": 12514 }, { "epoch": 0.1868761152464928, "grad_norm": 0.314453125, "grad_norm_var": 0.001973406473795573, "learning_rate": 0.0001, "loss": 1.4458, "loss/crossentropy": 2.684661865234375, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20359548926353455, "step": 12515 }, { "epoch": 0.1868910474171078, "grad_norm": 0.2890625, "grad_norm_var": 0.002042388916015625, "learning_rate": 0.0001, "loss": 1.341, "loss/crossentropy": 2.733467221260071, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18865618109703064, "step": 12516 }, { "epoch": 0.18690597958772276, "grad_norm": 0.3828125, "grad_norm_var": 0.002247476577758789, "learning_rate": 0.0001, "loss": 1.5558, "loss/crossentropy": 2.6181007623672485, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.23158501088619232, "step": 12517 }, { "epoch": 0.18692091175833775, "grad_norm": 0.3046875, "grad_norm_var": 0.002283668518066406, "learning_rate": 0.0001, "loss": 1.4751, "loss/crossentropy": 2.562911033630371, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20558065176010132, "step": 12518 }, { "epoch": 0.18693584392895274, "grad_norm": 0.2578125, "grad_norm_var": 0.0025632063547770183, "learning_rate": 0.0001, "loss": 1.2419, "loss/crossentropy": 2.4985129833221436, "loss/fcd": 1.078125, "loss/idx": 11.0, "loss/logits": 0.16376738250255585, "step": 12519 }, { "epoch": 0.1869507760995677, "grad_norm": 0.2890625, "grad_norm_var": 0.0026045322418212892, "learning_rate": 0.0001, "loss": 1.3486, "loss/crossentropy": 2.667526364326477, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18844157457351685, "step": 12520 }, { "epoch": 0.1869657082701827, "grad_norm": 0.294921875, "grad_norm_var": 0.002639007568359375, "learning_rate": 0.0001, "loss": 1.3762, "loss/crossentropy": 2.3893978595733643, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18476419150829315, "step": 12521 }, { "epoch": 0.18698064044079768, "grad_norm": 0.302734375, "grad_norm_var": 0.0026247501373291016, "learning_rate": 0.0001, "loss": 1.3427, "loss/crossentropy": 2.6614837646484375, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1825784593820572, "step": 12522 }, { "epoch": 0.18699557261141267, "grad_norm": 0.314453125, "grad_norm_var": 0.0026229222615559897, "learning_rate": 0.0001, "loss": 1.4898, "loss/crossentropy": 2.69948148727417, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.23196033388376236, "step": 12523 }, { "epoch": 0.18701050478202763, "grad_norm": 0.26953125, "grad_norm_var": 0.0027236302693684896, "learning_rate": 0.0001, "loss": 1.3119, "loss/crossentropy": 2.5676709413528442, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.18692418932914734, "step": 12524 }, { "epoch": 0.18702543695264262, "grad_norm": 0.34375, "grad_norm_var": 0.002778355280558268, "learning_rate": 0.0001, "loss": 1.516, "loss/crossentropy": 2.513038754463196, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.222983255982399, "step": 12525 }, { "epoch": 0.1870403691232576, "grad_norm": 0.345703125, "grad_norm_var": 0.00269621213277181, "learning_rate": 0.0001, "loss": 1.4423, "loss/crossentropy": 2.531078815460205, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19232311844825745, "step": 12526 }, { "epoch": 0.18705530129387257, "grad_norm": 0.263671875, "grad_norm_var": 0.0028859456380208332, "learning_rate": 0.0001, "loss": 1.3131, "loss/crossentropy": 2.745425581932068, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1803155392408371, "step": 12527 }, { "epoch": 0.18707023346448756, "grad_norm": 0.298828125, "grad_norm_var": 0.0028807957967122394, "learning_rate": 0.0001, "loss": 1.3377, "loss/crossentropy": 2.8490335941314697, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.17367655038833618, "step": 12528 }, { "epoch": 0.18708516563510255, "grad_norm": 0.275390625, "grad_norm_var": 0.002908690770467122, "learning_rate": 0.0001, "loss": 1.3755, "loss/crossentropy": 2.5586479902267456, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19192630052566528, "step": 12529 }, { "epoch": 0.18710009780571754, "grad_norm": 0.302734375, "grad_norm_var": 0.0010707696278889974, "learning_rate": 0.0001, "loss": 1.3247, "loss/crossentropy": 2.5289456844329834, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1762806698679924, "step": 12530 }, { "epoch": 0.1871150299763325, "grad_norm": 0.322265625, "grad_norm_var": 0.00108640988667806, "learning_rate": 0.0001, "loss": 1.4352, "loss/crossentropy": 2.5126473903656006, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.21643562614917755, "step": 12531 }, { "epoch": 0.1871299621469475, "grad_norm": 0.3203125, "grad_norm_var": 0.0010869185129801433, "learning_rate": 0.0001, "loss": 1.5301, "loss/crossentropy": 2.4154245853424072, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.26059187948703766, "step": 12532 }, { "epoch": 0.18714489431756248, "grad_norm": 0.310546875, "grad_norm_var": 0.0006687800089518229, "learning_rate": 0.0001, "loss": 1.4626, "loss/crossentropy": 2.744835138320923, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20478133112192154, "step": 12533 }, { "epoch": 0.18715982648817744, "grad_norm": 0.345703125, "grad_norm_var": 0.0007939497629801432, "learning_rate": 0.0001, "loss": 1.5585, "loss/crossentropy": 2.605484962463379, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.22648775577545166, "step": 12534 }, { "epoch": 0.18717475865879243, "grad_norm": 0.328125, "grad_norm_var": 0.0006737867991129557, "learning_rate": 0.0001, "loss": 1.5669, "loss/crossentropy": 2.2812306880950928, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.2114058881998062, "step": 12535 }, { "epoch": 0.18718969082940742, "grad_norm": 0.33203125, "grad_norm_var": 0.0006807804107666016, "learning_rate": 0.0001, "loss": 1.2498, "loss/crossentropy": 2.547516107559204, "loss/fcd": 1.08984375, "loss/idx": 11.0, "loss/logits": 0.159935362637043, "step": 12536 }, { "epoch": 0.18720462300002239, "grad_norm": 0.275390625, "grad_norm_var": 0.0007456302642822266, "learning_rate": 0.0001, "loss": 1.346, "loss/crossentropy": 2.5073682069778442, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18972881883382797, "step": 12537 }, { "epoch": 0.18721955517063738, "grad_norm": 0.306640625, "grad_norm_var": 0.0007430871327718098, "learning_rate": 0.0001, "loss": 1.3846, "loss/crossentropy": 2.658882975578308, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1931850016117096, "step": 12538 }, { "epoch": 0.18723448734125236, "grad_norm": 0.306640625, "grad_norm_var": 0.0007419427235921224, "learning_rate": 0.0001, "loss": 1.4149, "loss/crossentropy": 2.489253520965576, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19615968316793442, "step": 12539 }, { "epoch": 0.18724941951186735, "grad_norm": 0.29296875, "grad_norm_var": 0.0006522973378499349, "learning_rate": 0.0001, "loss": 1.3553, "loss/crossentropy": 2.621869921684265, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.19122660160064697, "step": 12540 }, { "epoch": 0.18726435168248232, "grad_norm": 0.333984375, "grad_norm_var": 0.0006151835123697917, "learning_rate": 0.0001, "loss": 1.4974, "loss/crossentropy": 2.8410850763320923, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.21226520836353302, "step": 12541 }, { "epoch": 0.1872792838530973, "grad_norm": 0.326171875, "grad_norm_var": 0.0005462010701497396, "learning_rate": 0.0001, "loss": 1.4929, "loss/crossentropy": 2.651952862739563, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22340862452983856, "step": 12542 }, { "epoch": 0.1872942160237123, "grad_norm": 0.29296875, "grad_norm_var": 0.0004234155019124349, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.563937783241272, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20773203670978546, "step": 12543 }, { "epoch": 0.18730914819432726, "grad_norm": 0.3125, "grad_norm_var": 0.00041351318359375, "learning_rate": 0.0001, "loss": 1.4078, "loss/crossentropy": 2.2558826208114624, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.18905649334192276, "step": 12544 }, { "epoch": 0.18732408036494225, "grad_norm": 0.6484375, "grad_norm_var": 0.007314030329386393, "learning_rate": 0.0001, "loss": 1.4407, "loss/crossentropy": 2.532215118408203, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.1868348866701126, "step": 12545 }, { "epoch": 0.18733901253555724, "grad_norm": 0.36328125, "grad_norm_var": 0.007283973693847656, "learning_rate": 0.0001, "loss": 1.4283, "loss/crossentropy": 2.654424786567688, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.1861526221036911, "step": 12546 }, { "epoch": 0.18735394470617223, "grad_norm": 0.3515625, "grad_norm_var": 0.007273721694946289, "learning_rate": 0.0001, "loss": 1.4115, "loss/crossentropy": 2.594941020011902, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1927110254764557, "step": 12547 }, { "epoch": 0.1873688768767872, "grad_norm": 0.287109375, "grad_norm_var": 0.007431793212890625, "learning_rate": 0.0001, "loss": 1.3616, "loss/crossentropy": 2.754029393196106, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19366982579231262, "step": 12548 }, { "epoch": 0.18738380904740218, "grad_norm": 0.341796875, "grad_norm_var": 0.007376861572265625, "learning_rate": 0.0001, "loss": 1.4433, "loss/crossentropy": 2.6288522481918335, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20106948167085648, "step": 12549 }, { "epoch": 0.18739874121801717, "grad_norm": 0.408203125, "grad_norm_var": 0.007665761311848958, "learning_rate": 0.0001, "loss": 1.3658, "loss/crossentropy": 2.470139503479004, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1783139854669571, "step": 12550 }, { "epoch": 0.18741367338863213, "grad_norm": 0.265625, "grad_norm_var": 0.008044179280598958, "learning_rate": 0.0001, "loss": 1.3568, "loss/crossentropy": 2.5012179613113403, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18883108347654343, "step": 12551 }, { "epoch": 0.18742860555924712, "grad_norm": 0.3359375, "grad_norm_var": 0.008040809631347656, "learning_rate": 0.0001, "loss": 1.4869, "loss/crossentropy": 2.556317925453186, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21344725042581558, "step": 12552 }, { "epoch": 0.1874435377298621, "grad_norm": 0.291015625, "grad_norm_var": 0.007920265197753906, "learning_rate": 0.0001, "loss": 1.267, "loss/crossentropy": 2.5932587385177612, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.15766698122024536, "step": 12553 }, { "epoch": 0.18745846990047707, "grad_norm": 0.326171875, "grad_norm_var": 0.007853190104166666, "learning_rate": 0.0001, "loss": 1.5614, "loss/crossentropy": 2.3484901189804077, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.2255081608891487, "step": 12554 }, { "epoch": 0.18747340207109206, "grad_norm": 0.296875, "grad_norm_var": 0.007906198501586914, "learning_rate": 0.0001, "loss": 1.3053, "loss/crossentropy": 2.4732515811920166, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.1646295040845871, "step": 12555 }, { "epoch": 0.18748833424170705, "grad_norm": 0.271484375, "grad_norm_var": 0.008075968424479166, "learning_rate": 0.0001, "loss": 1.3411, "loss/crossentropy": 2.6746262311935425, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18091289699077606, "step": 12556 }, { "epoch": 0.18750326641232204, "grad_norm": 0.26953125, "grad_norm_var": 0.008394352595011393, "learning_rate": 0.0001, "loss": 1.3033, "loss/crossentropy": 2.7614129781723022, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.17825371772050858, "step": 12557 }, { "epoch": 0.187518198582937, "grad_norm": 0.26953125, "grad_norm_var": 0.008675066630045573, "learning_rate": 0.0001, "loss": 1.259, "loss/crossentropy": 2.5614019632339478, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.16131721436977386, "step": 12558 }, { "epoch": 0.187533130753552, "grad_norm": 0.27734375, "grad_norm_var": 0.008774248758951823, "learning_rate": 0.0001, "loss": 1.3423, "loss/crossentropy": 2.5639395713806152, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.16656210273504257, "step": 12559 }, { "epoch": 0.18754806292416698, "grad_norm": 0.279296875, "grad_norm_var": 0.008930699030558268, "learning_rate": 0.0001, "loss": 1.4088, "loss/crossentropy": 2.3818124532699585, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19788572192192078, "step": 12560 }, { "epoch": 0.18756299509478194, "grad_norm": 0.330078125, "grad_norm_var": 0.0017567316691080729, "learning_rate": 0.0001, "loss": 1.4238, "loss/crossentropy": 2.8695003986358643, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1972125619649887, "step": 12561 }, { "epoch": 0.18757792726539693, "grad_norm": 0.2734375, "grad_norm_var": 0.0016265869140625, "learning_rate": 0.0001, "loss": 1.3326, "loss/crossentropy": 2.4388132095336914, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1723988950252533, "step": 12562 }, { "epoch": 0.18759285943601192, "grad_norm": 0.330078125, "grad_norm_var": 0.001521158218383789, "learning_rate": 0.0001, "loss": 1.4794, "loss/crossentropy": 2.5241905450820923, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2099137380719185, "step": 12563 }, { "epoch": 0.18760779160662688, "grad_norm": 0.279296875, "grad_norm_var": 0.0015418847401936849, "learning_rate": 0.0001, "loss": 1.3616, "loss/crossentropy": 2.664388418197632, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18578174710273743, "step": 12564 }, { "epoch": 0.18762272377724187, "grad_norm": 0.27734375, "grad_norm_var": 0.0014668782552083334, "learning_rate": 0.0001, "loss": 1.4073, "loss/crossentropy": 2.603853225708008, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18468491733074188, "step": 12565 }, { "epoch": 0.18763765594785686, "grad_norm": 0.359375, "grad_norm_var": 0.0009038130442301433, "learning_rate": 0.0001, "loss": 1.4762, "loss/crossentropy": 2.5384199619293213, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.1910673901438713, "step": 12566 }, { "epoch": 0.18765258811847185, "grad_norm": 0.302734375, "grad_norm_var": 0.0008406956990559896, "learning_rate": 0.0001, "loss": 1.4573, "loss/crossentropy": 2.4691624641418457, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.1956193819642067, "step": 12567 }, { "epoch": 0.18766752028908681, "grad_norm": 0.328125, "grad_norm_var": 0.0008050918579101563, "learning_rate": 0.0001, "loss": 1.3716, "loss/crossentropy": 2.589249610900879, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1762964352965355, "step": 12568 }, { "epoch": 0.1876824524597018, "grad_norm": 0.294921875, "grad_norm_var": 0.0008026123046875, "learning_rate": 0.0001, "loss": 1.3891, "loss/crossentropy": 2.615429997444153, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1859496533870697, "step": 12569 }, { "epoch": 0.1876973846303168, "grad_norm": 0.30859375, "grad_norm_var": 0.0007555484771728516, "learning_rate": 0.0001, "loss": 1.4284, "loss/crossentropy": 2.4115025997161865, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.1783767119050026, "step": 12570 }, { "epoch": 0.18771231680093176, "grad_norm": 0.29296875, "grad_norm_var": 0.0007564385732014973, "learning_rate": 0.0001, "loss": 1.3163, "loss/crossentropy": 2.593737006187439, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1717405691742897, "step": 12571 }, { "epoch": 0.18772724897154675, "grad_norm": 0.287109375, "grad_norm_var": 0.0007195631663004557, "learning_rate": 0.0001, "loss": 1.4345, "loss/crossentropy": 2.473276972770691, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2079121693968773, "step": 12572 }, { "epoch": 0.18774218114216173, "grad_norm": 0.27734375, "grad_norm_var": 0.0006942590077718099, "learning_rate": 0.0001, "loss": 1.3513, "loss/crossentropy": 2.715388774871826, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18724246323108673, "step": 12573 }, { "epoch": 0.18775711331277672, "grad_norm": 0.287109375, "grad_norm_var": 0.0006469090779622396, "learning_rate": 0.0001, "loss": 1.5296, "loss/crossentropy": 2.5827187299728394, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.22882691025733948, "step": 12574 }, { "epoch": 0.1877720454833917, "grad_norm": 0.302734375, "grad_norm_var": 0.0006136417388916016, "learning_rate": 0.0001, "loss": 1.4296, "loss/crossentropy": 2.5326199531555176, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19918020069599152, "step": 12575 }, { "epoch": 0.18778697765400668, "grad_norm": 0.291015625, "grad_norm_var": 0.0005888462066650391, "learning_rate": 0.0001, "loss": 1.3512, "loss/crossentropy": 2.4614328145980835, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17155639827251434, "step": 12576 }, { "epoch": 0.18780190982462167, "grad_norm": 0.26953125, "grad_norm_var": 0.0005863825480143229, "learning_rate": 0.0001, "loss": 1.309, "loss/crossentropy": 2.627534031867981, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17621289193630219, "step": 12577 }, { "epoch": 0.18781684199523663, "grad_norm": 0.28515625, "grad_norm_var": 0.0005572001139322917, "learning_rate": 0.0001, "loss": 1.4348, "loss/crossentropy": 2.5126988887786865, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.21212026476860046, "step": 12578 }, { "epoch": 0.18783177416585162, "grad_norm": 0.333984375, "grad_norm_var": 0.0005746841430664063, "learning_rate": 0.0001, "loss": 1.4048, "loss/crossentropy": 2.537639021873474, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18211263418197632, "step": 12579 }, { "epoch": 0.1878467063364666, "grad_norm": 0.328125, "grad_norm_var": 0.0005981286366780598, "learning_rate": 0.0001, "loss": 1.4818, "loss/crossentropy": 2.50072181224823, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.19667928665876389, "step": 12580 }, { "epoch": 0.18786163850708157, "grad_norm": 0.2734375, "grad_norm_var": 0.0006117343902587891, "learning_rate": 0.0001, "loss": 1.3552, "loss/crossentropy": 2.586480498313904, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.19503501802682877, "step": 12581 }, { "epoch": 0.18787657067769656, "grad_norm": 0.296875, "grad_norm_var": 0.0003726800282796224, "learning_rate": 0.0001, "loss": 1.4274, "loss/crossentropy": 2.6848255395889282, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.2047795206308365, "step": 12582 }, { "epoch": 0.18789150284831155, "grad_norm": 0.31640625, "grad_norm_var": 0.00039393107096354166, "learning_rate": 0.0001, "loss": 1.4549, "loss/crossentropy": 2.4818437099456787, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2087807059288025, "step": 12583 }, { "epoch": 0.18790643501892654, "grad_norm": 0.298828125, "grad_norm_var": 0.0003312269846598307, "learning_rate": 0.0001, "loss": 1.5383, "loss/crossentropy": 2.4027099609375, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.22974348813295364, "step": 12584 }, { "epoch": 0.1879213671895415, "grad_norm": 0.3046875, "grad_norm_var": 0.00033512115478515623, "learning_rate": 0.0001, "loss": 1.4975, "loss/crossentropy": 2.553757905960083, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.22015371173620224, "step": 12585 }, { "epoch": 0.1879362993601565, "grad_norm": 0.259765625, "grad_norm_var": 0.0004094282786051432, "learning_rate": 0.0001, "loss": 1.3664, "loss/crossentropy": 2.453120231628418, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19059574604034424, "step": 12586 }, { "epoch": 0.18795123153077148, "grad_norm": 0.30078125, "grad_norm_var": 0.00041209856669108074, "learning_rate": 0.0001, "loss": 1.3312, "loss/crossentropy": 2.628813624382019, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.16713877767324448, "step": 12587 }, { "epoch": 0.18796616370138644, "grad_norm": 0.310546875, "grad_norm_var": 0.0004231611887613932, "learning_rate": 0.0001, "loss": 1.4323, "loss/crossentropy": 2.490232467651367, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.1939765363931656, "step": 12588 }, { "epoch": 0.18798109587200143, "grad_norm": 0.3203125, "grad_norm_var": 0.0004315535227457682, "learning_rate": 0.0001, "loss": 1.5372, "loss/crossentropy": 2.6038514375686646, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.2208145409822464, "step": 12589 }, { "epoch": 0.18799602804261642, "grad_norm": 0.287109375, "grad_norm_var": 0.0004315535227457682, "learning_rate": 0.0001, "loss": 1.496, "loss/crossentropy": 2.5682421922683716, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.21474777162075043, "step": 12590 }, { "epoch": 0.1880109602132314, "grad_norm": 0.302734375, "grad_norm_var": 0.0004315535227457682, "learning_rate": 0.0001, "loss": 1.4811, "loss/crossentropy": 2.5521771907806396, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2116023600101471, "step": 12591 }, { "epoch": 0.18802589238384637, "grad_norm": 0.294921875, "grad_norm_var": 0.0004285017649332682, "learning_rate": 0.0001, "loss": 1.4963, "loss/crossentropy": 2.4014710187911987, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21900078654289246, "step": 12592 }, { "epoch": 0.18804082455446136, "grad_norm": 0.306640625, "grad_norm_var": 0.00036900838216145834, "learning_rate": 0.0001, "loss": 1.4522, "loss/crossentropy": 2.474578380584717, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20218515396118164, "step": 12593 }, { "epoch": 0.18805575672507635, "grad_norm": 0.30078125, "grad_norm_var": 0.00035069783528645835, "learning_rate": 0.0001, "loss": 1.4619, "loss/crossentropy": 2.4631898403167725, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21976204961538315, "step": 12594 }, { "epoch": 0.1880706888956913, "grad_norm": 0.306640625, "grad_norm_var": 0.00028171539306640623, "learning_rate": 0.0001, "loss": 1.4816, "loss/crossentropy": 2.681929111480713, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.21208231896162033, "step": 12595 }, { "epoch": 0.1880856210663063, "grad_norm": 0.26953125, "grad_norm_var": 0.00028076171875, "learning_rate": 0.0001, "loss": 1.4641, "loss/crossentropy": 2.21567964553833, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.21805042028427124, "step": 12596 }, { "epoch": 0.1881005532369213, "grad_norm": 0.291015625, "grad_norm_var": 0.0002451419830322266, "learning_rate": 0.0001, "loss": 1.4638, "loss/crossentropy": 2.6694071292877197, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20597922801971436, "step": 12597 }, { "epoch": 0.18811548540753625, "grad_norm": 0.28515625, "grad_norm_var": 0.0002554416656494141, "learning_rate": 0.0001, "loss": 1.4807, "loss/crossentropy": 2.638842821121216, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2306593954563141, "step": 12598 }, { "epoch": 0.18813041757815124, "grad_norm": 0.3046875, "grad_norm_var": 0.00023407936096191405, "learning_rate": 0.0001, "loss": 1.4245, "loss/crossentropy": 2.555083155632019, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.209641233086586, "step": 12599 }, { "epoch": 0.18814534974876623, "grad_norm": 0.31640625, "grad_norm_var": 0.00025882720947265623, "learning_rate": 0.0001, "loss": 1.4053, "loss/crossentropy": 2.368828535079956, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.17088573426008224, "step": 12600 }, { "epoch": 0.18816028191938122, "grad_norm": 0.287109375, "grad_norm_var": 0.00026154518127441406, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.2742373943328857, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.2476811707019806, "step": 12601 }, { "epoch": 0.18817521408999618, "grad_norm": 0.271484375, "grad_norm_var": 0.00021271705627441407, "learning_rate": 0.0001, "loss": 1.2243, "loss/crossentropy": 2.4159951210021973, "loss/fcd": 1.07421875, "loss/idx": 11.0, "loss/logits": 0.1501147672533989, "step": 12602 }, { "epoch": 0.18819014626061117, "grad_norm": 0.265625, "grad_norm_var": 0.00027337074279785154, "learning_rate": 0.0001, "loss": 1.3935, "loss/crossentropy": 2.7158961296081543, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.20208155363798141, "step": 12603 }, { "epoch": 0.18820507843122616, "grad_norm": 0.33203125, "grad_norm_var": 0.00034662882486979164, "learning_rate": 0.0001, "loss": 1.4574, "loss/crossentropy": 2.343722939491272, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.1878591924905777, "step": 12604 }, { "epoch": 0.18822001060184113, "grad_norm": 0.45703125, "grad_norm_var": 0.0019510269165039062, "learning_rate": 0.0001, "loss": 1.8577, "loss/crossentropy": 2.4074825048446655, "loss/fcd": 1.54296875, "loss/idx": 11.0, "loss/logits": 0.31474269181489944, "step": 12605 }, { "epoch": 0.18823494277245612, "grad_norm": 0.2734375, "grad_norm_var": 0.001995197931925456, "learning_rate": 0.0001, "loss": 1.3133, "loss/crossentropy": 2.82666552066803, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.18053391575813293, "step": 12606 }, { "epoch": 0.1882498749430711, "grad_norm": 0.287109375, "grad_norm_var": 0.0020132541656494142, "learning_rate": 0.0001, "loss": 1.3291, "loss/crossentropy": 2.630954146385193, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1806979700922966, "step": 12607 }, { "epoch": 0.1882648071136861, "grad_norm": 0.294921875, "grad_norm_var": 0.0020132541656494142, "learning_rate": 0.0001, "loss": 1.3898, "loss/crossentropy": 2.3652461767196655, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18665114790201187, "step": 12608 }, { "epoch": 0.18827973928430106, "grad_norm": 0.28125, "grad_norm_var": 0.0020415623982747394, "learning_rate": 0.0001, "loss": 1.2686, "loss/crossentropy": 2.488915205001831, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.15923890471458435, "step": 12609 }, { "epoch": 0.18829467145491605, "grad_norm": 0.3125, "grad_norm_var": 0.0020490010579427083, "learning_rate": 0.0001, "loss": 1.3727, "loss/crossentropy": 2.741648316383362, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18912610411643982, "step": 12610 }, { "epoch": 0.18830960362553104, "grad_norm": 0.2734375, "grad_norm_var": 0.0020984490712483723, "learning_rate": 0.0001, "loss": 1.2951, "loss/crossentropy": 2.423986554145813, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.16226623952388763, "step": 12611 }, { "epoch": 0.188324535796146, "grad_norm": 0.30859375, "grad_norm_var": 0.0020342350006103517, "learning_rate": 0.0001, "loss": 1.4289, "loss/crossentropy": 2.5647960901260376, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20234104990959167, "step": 12612 }, { "epoch": 0.188339467966761, "grad_norm": 0.32421875, "grad_norm_var": 0.0020517985026041667, "learning_rate": 0.0001, "loss": 1.4229, "loss/crossentropy": 2.866174817085266, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2041333019733429, "step": 12613 }, { "epoch": 0.18835440013737598, "grad_norm": 0.27734375, "grad_norm_var": 0.002075958251953125, "learning_rate": 0.0001, "loss": 1.3975, "loss/crossentropy": 2.605584979057312, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.2022245228290558, "step": 12614 }, { "epoch": 0.18836933230799094, "grad_norm": 0.5859375, "grad_norm_var": 0.007038116455078125, "learning_rate": 0.0001, "loss": 1.7518, "loss/crossentropy": 2.712088942527771, "loss/fcd": 1.50390625, "loss/idx": 11.0, "loss/logits": 0.24793290346860886, "step": 12615 }, { "epoch": 0.18838426447860593, "grad_norm": 0.322265625, "grad_norm_var": 0.007036066055297852, "learning_rate": 0.0001, "loss": 1.4572, "loss/crossentropy": 2.9424219131469727, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20721519738435745, "step": 12616 }, { "epoch": 0.18839919664922092, "grad_norm": 0.28515625, "grad_norm_var": 0.007045427958170573, "learning_rate": 0.0001, "loss": 1.3556, "loss/crossentropy": 2.7307803630828857, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18767988681793213, "step": 12617 }, { "epoch": 0.1884141288198359, "grad_norm": 0.314453125, "grad_norm_var": 0.006871287027994792, "learning_rate": 0.0001, "loss": 1.4156, "loss/crossentropy": 2.7617284059524536, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.189076229929924, "step": 12618 }, { "epoch": 0.18842906099045087, "grad_norm": 0.32421875, "grad_norm_var": 0.006624285380045573, "learning_rate": 0.0001, "loss": 1.4187, "loss/crossentropy": 2.5339771509170532, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1999758780002594, "step": 12619 }, { "epoch": 0.18844399316106586, "grad_norm": 0.31640625, "grad_norm_var": 0.006631914774576823, "learning_rate": 0.0001, "loss": 1.4418, "loss/crossentropy": 2.5267937183380127, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2152276560664177, "step": 12620 }, { "epoch": 0.18845892533168085, "grad_norm": 0.283203125, "grad_norm_var": 0.005515782038370768, "learning_rate": 0.0001, "loss": 1.3632, "loss/crossentropy": 2.453012704849243, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18351102620363235, "step": 12621 }, { "epoch": 0.1884738575022958, "grad_norm": 0.298828125, "grad_norm_var": 0.005410194396972656, "learning_rate": 0.0001, "loss": 1.5058, "loss/crossentropy": 2.3846189975738525, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21676474064588547, "step": 12622 }, { "epoch": 0.1884887896729108, "grad_norm": 0.302734375, "grad_norm_var": 0.005360857645670573, "learning_rate": 0.0001, "loss": 1.5042, "loss/crossentropy": 2.348533034324646, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.20734970271587372, "step": 12623 }, { "epoch": 0.1885037218435258, "grad_norm": 0.294921875, "grad_norm_var": 0.005360857645670573, "learning_rate": 0.0001, "loss": 1.3224, "loss/crossentropy": 2.8227473497390747, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17784085869789124, "step": 12624 }, { "epoch": 0.18851865401414075, "grad_norm": 0.314453125, "grad_norm_var": 0.005262231826782227, "learning_rate": 0.0001, "loss": 1.4943, "loss/crossentropy": 2.3745259046554565, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.20519593358039856, "step": 12625 }, { "epoch": 0.18853358618475574, "grad_norm": 0.298828125, "grad_norm_var": 0.005289713541666667, "learning_rate": 0.0001, "loss": 1.4414, "loss/crossentropy": 2.600179672241211, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2070627063512802, "step": 12626 }, { "epoch": 0.18854851835537073, "grad_norm": 0.31640625, "grad_norm_var": 0.005136553446451823, "learning_rate": 0.0001, "loss": 1.4528, "loss/crossentropy": 2.709912896156311, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21452346444129944, "step": 12627 }, { "epoch": 0.18856345052598572, "grad_norm": 0.322265625, "grad_norm_var": 0.005121978123982748, "learning_rate": 0.0001, "loss": 1.431, "loss/crossentropy": 2.681355118751526, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.21221653372049332, "step": 12628 }, { "epoch": 0.18857838269660068, "grad_norm": 0.279296875, "grad_norm_var": 0.00524590810139974, "learning_rate": 0.0001, "loss": 1.3806, "loss/crossentropy": 2.8404645919799805, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19311488419771194, "step": 12629 }, { "epoch": 0.18859331486721567, "grad_norm": 0.302734375, "grad_norm_var": 0.005138254165649414, "learning_rate": 0.0001, "loss": 1.4456, "loss/crossentropy": 2.604079842567444, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.1956317126750946, "step": 12630 }, { "epoch": 0.18860824703783066, "grad_norm": 0.333984375, "grad_norm_var": 0.00026035308837890625, "learning_rate": 0.0001, "loss": 1.4086, "loss/crossentropy": 2.6660157442092896, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.2093975841999054, "step": 12631 }, { "epoch": 0.18862317920844562, "grad_norm": 0.271484375, "grad_norm_var": 0.0003173828125, "learning_rate": 0.0001, "loss": 1.3547, "loss/crossentropy": 2.328666567802429, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.17888426780700684, "step": 12632 }, { "epoch": 0.1886381113790606, "grad_norm": 0.310546875, "grad_norm_var": 0.0002948602040608724, "learning_rate": 0.0001, "loss": 1.3445, "loss/crossentropy": 2.55918025970459, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17650799453258514, "step": 12633 }, { "epoch": 0.1886530435496756, "grad_norm": 0.326171875, "grad_norm_var": 0.0003177483876546224, "learning_rate": 0.0001, "loss": 1.4332, "loss/crossentropy": 2.4839601516723633, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19878139346837997, "step": 12634 }, { "epoch": 0.1886679757202906, "grad_norm": 0.291015625, "grad_norm_var": 0.00030612945556640625, "learning_rate": 0.0001, "loss": 1.4382, "loss/crossentropy": 2.7571383714675903, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20777677744627, "step": 12635 }, { "epoch": 0.18868290789090555, "grad_norm": 0.337890625, "grad_norm_var": 0.00037064552307128904, "learning_rate": 0.0001, "loss": 1.4627, "loss/crossentropy": 2.501809239387512, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20101453363895416, "step": 12636 }, { "epoch": 0.18869784006152054, "grad_norm": 0.28125, "grad_norm_var": 0.00037663777669270834, "learning_rate": 0.0001, "loss": 1.4267, "loss/crossentropy": 2.64617121219635, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20406001806259155, "step": 12637 }, { "epoch": 0.18871277223213553, "grad_norm": 0.298828125, "grad_norm_var": 0.00037663777669270834, "learning_rate": 0.0001, "loss": 1.4072, "loss/crossentropy": 2.64902126789093, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20410769432783127, "step": 12638 }, { "epoch": 0.1887277044027505, "grad_norm": 0.5703125, "grad_norm_var": 0.004764413833618164, "learning_rate": 0.0001, "loss": 1.5908, "loss/crossentropy": 1.9980601072311401, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.18068242818117142, "step": 12639 }, { "epoch": 0.18874263657336549, "grad_norm": 0.365234375, "grad_norm_var": 0.004820489883422851, "learning_rate": 0.0001, "loss": 1.6512, "loss/crossentropy": 2.3734508752822876, "loss/fcd": 1.4140625, "loss/idx": 11.0, "loss/logits": 0.23709262907505035, "step": 12640 }, { "epoch": 0.18875756874398048, "grad_norm": 0.32421875, "grad_norm_var": 0.00481103261311849, "learning_rate": 0.0001, "loss": 1.4801, "loss/crossentropy": 2.608569622039795, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20666644722223282, "step": 12641 }, { "epoch": 0.18877250091459544, "grad_norm": 0.291015625, "grad_norm_var": 0.004844093322753906, "learning_rate": 0.0001, "loss": 1.3999, "loss/crossentropy": 2.5263575315475464, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18900475651025772, "step": 12642 }, { "epoch": 0.18878743308521043, "grad_norm": 0.3046875, "grad_norm_var": 0.004868316650390625, "learning_rate": 0.0001, "loss": 1.3902, "loss/crossentropy": 2.8444981575012207, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19487716257572174, "step": 12643 }, { "epoch": 0.18880236525582542, "grad_norm": 0.29296875, "grad_norm_var": 0.004935312271118164, "learning_rate": 0.0001, "loss": 1.5001, "loss/crossentropy": 2.592341661453247, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2305217757821083, "step": 12644 }, { "epoch": 0.1888172974264404, "grad_norm": 0.275390625, "grad_norm_var": 0.004959472020467122, "learning_rate": 0.0001, "loss": 1.3013, "loss/crossentropy": 2.5896955728530884, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.17629560828208923, "step": 12645 }, { "epoch": 0.18883222959705537, "grad_norm": 0.283203125, "grad_norm_var": 0.005037673314412435, "learning_rate": 0.0001, "loss": 1.2968, "loss/crossentropy": 2.5486265420913696, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1718035340309143, "step": 12646 }, { "epoch": 0.18884716176767036, "grad_norm": 0.32421875, "grad_norm_var": 0.005028533935546875, "learning_rate": 0.0001, "loss": 1.5438, "loss/crossentropy": 2.423814535140991, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.223480686545372, "step": 12647 }, { "epoch": 0.18886209393828535, "grad_norm": 0.2734375, "grad_norm_var": 0.005015675226847331, "learning_rate": 0.0001, "loss": 1.3435, "loss/crossentropy": 2.6105252504348755, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17548351734876633, "step": 12648 }, { "epoch": 0.1888770261089003, "grad_norm": 0.263671875, "grad_norm_var": 0.005223957697550455, "learning_rate": 0.0001, "loss": 1.3662, "loss/crossentropy": 2.302902340888977, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18647215515375137, "step": 12649 }, { "epoch": 0.1888919582795153, "grad_norm": 0.34765625, "grad_norm_var": 0.0052734375, "learning_rate": 0.0001, "loss": 1.4591, "loss/crossentropy": 2.5322312116622925, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20907829701900482, "step": 12650 }, { "epoch": 0.1889068904501303, "grad_norm": 0.376953125, "grad_norm_var": 0.005399322509765625, "learning_rate": 0.0001, "loss": 1.6041, "loss/crossentropy": 2.5273555517196655, "loss/fcd": 1.37109375, "loss/idx": 11.0, "loss/logits": 0.23302268981933594, "step": 12651 }, { "epoch": 0.18892182262074528, "grad_norm": 0.294921875, "grad_norm_var": 0.005444780985514323, "learning_rate": 0.0001, "loss": 1.5641, "loss/crossentropy": 2.233674943447113, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.23207194358110428, "step": 12652 }, { "epoch": 0.18893675479136024, "grad_norm": 0.349609375, "grad_norm_var": 0.005356327692667643, "learning_rate": 0.0001, "loss": 1.5262, "loss/crossentropy": 2.6780463457107544, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2331964075565338, "step": 12653 }, { "epoch": 0.18895168696197523, "grad_norm": 0.271484375, "grad_norm_var": 0.005506753921508789, "learning_rate": 0.0001, "loss": 1.3783, "loss/crossentropy": 2.525170087814331, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.20256318897008896, "step": 12654 }, { "epoch": 0.18896661913259022, "grad_norm": 0.314453125, "grad_norm_var": 0.0012486775716145833, "learning_rate": 0.0001, "loss": 1.4091, "loss/crossentropy": 2.5086055994033813, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.2021011859178543, "step": 12655 }, { "epoch": 0.18898155130320518, "grad_norm": 0.294921875, "grad_norm_var": 0.0010358174641927084, "learning_rate": 0.0001, "loss": 1.3645, "loss/crossentropy": 2.72516131401062, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.19264177232980728, "step": 12656 }, { "epoch": 0.18899648347382017, "grad_norm": 0.287109375, "grad_norm_var": 0.0010276635487874348, "learning_rate": 0.0001, "loss": 1.4169, "loss/crossentropy": 2.5717965364456177, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19032957404851913, "step": 12657 }, { "epoch": 0.18901141564443516, "grad_norm": 0.380859375, "grad_norm_var": 0.0013903141021728515, "learning_rate": 0.0001, "loss": 1.48, "loss/crossentropy": 2.8769381046295166, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2104574292898178, "step": 12658 }, { "epoch": 0.18902634781505012, "grad_norm": 0.2890625, "grad_norm_var": 0.0014134565989176431, "learning_rate": 0.0001, "loss": 1.4314, "loss/crossentropy": 2.473752021789551, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20097971707582474, "step": 12659 }, { "epoch": 0.1890412799856651, "grad_norm": 0.296875, "grad_norm_var": 0.00140684445699056, "learning_rate": 0.0001, "loss": 1.3836, "loss/crossentropy": 2.3238173723220825, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18439137935638428, "step": 12660 }, { "epoch": 0.1890562121562801, "grad_norm": 0.337890625, "grad_norm_var": 0.0013814131418863932, "learning_rate": 0.0001, "loss": 1.5516, "loss/crossentropy": 2.4911588430404663, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.246914803981781, "step": 12661 }, { "epoch": 0.1890711443268951, "grad_norm": 0.28515625, "grad_norm_var": 0.0013742446899414062, "learning_rate": 0.0001, "loss": 1.4516, "loss/crossentropy": 2.514091968536377, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21331837028265, "step": 12662 }, { "epoch": 0.18908607649751005, "grad_norm": 0.30859375, "grad_norm_var": 0.0013635635375976562, "learning_rate": 0.0001, "loss": 1.3782, "loss/crossentropy": 2.72454035282135, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.194560706615448, "step": 12663 }, { "epoch": 0.18910100866812504, "grad_norm": 0.380859375, "grad_norm_var": 0.0015497684478759765, "learning_rate": 0.0001, "loss": 1.76, "loss/crossentropy": 2.5094656944274902, "loss/fcd": 1.45703125, "loss/idx": 11.0, "loss/logits": 0.30299703776836395, "step": 12664 }, { "epoch": 0.18911594083874003, "grad_norm": 0.35546875, "grad_norm_var": 0.00141754150390625, "learning_rate": 0.0001, "loss": 1.6241, "loss/crossentropy": 2.467388868331909, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.22961527109146118, "step": 12665 }, { "epoch": 0.189130873009355, "grad_norm": 0.369140625, "grad_norm_var": 0.0015163262685139974, "learning_rate": 0.0001, "loss": 1.5555, "loss/crossentropy": 2.584117889404297, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.2312898486852646, "step": 12666 }, { "epoch": 0.18914580517996998, "grad_norm": 0.267578125, "grad_norm_var": 0.0015003045399983725, "learning_rate": 0.0001, "loss": 1.2906, "loss/crossentropy": 2.471327543258667, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.16556258499622345, "step": 12667 }, { "epoch": 0.18916073735058497, "grad_norm": 0.265625, "grad_norm_var": 0.0016431172688802084, "learning_rate": 0.0001, "loss": 1.3356, "loss/crossentropy": 2.418222427368164, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17544469982385635, "step": 12668 }, { "epoch": 0.18917566952119996, "grad_norm": 0.361328125, "grad_norm_var": 0.0017043431599934897, "learning_rate": 0.0001, "loss": 1.6081, "loss/crossentropy": 2.4512813091278076, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.24869558215141296, "step": 12669 }, { "epoch": 0.18919060169181492, "grad_norm": 0.3828125, "grad_norm_var": 0.0018085320790608725, "learning_rate": 0.0001, "loss": 1.4893, "loss/crossentropy": 2.7935914993286133, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.20415565371513367, "step": 12670 }, { "epoch": 0.18920553386242991, "grad_norm": 0.267578125, "grad_norm_var": 0.0020030816396077473, "learning_rate": 0.0001, "loss": 1.363, "loss/crossentropy": 2.50970196723938, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.17945589870214462, "step": 12671 }, { "epoch": 0.1892204660330449, "grad_norm": 0.3359375, "grad_norm_var": 0.0019673665364583334, "learning_rate": 0.0001, "loss": 1.4402, "loss/crossentropy": 2.6527949571609497, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20192740857601166, "step": 12672 }, { "epoch": 0.18923539820365987, "grad_norm": 0.7265625, "grad_norm_var": 0.011920150121053059, "learning_rate": 0.0001, "loss": 1.6323, "loss/crossentropy": 2.6672648191452026, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.2494981586933136, "step": 12673 }, { "epoch": 0.18925033037427486, "grad_norm": 0.365234375, "grad_norm_var": 0.011872593561808269, "learning_rate": 0.0001, "loss": 1.597, "loss/crossentropy": 2.6792668104171753, "loss/fcd": 1.375, "loss/idx": 11.0, "loss/logits": 0.22201433032751083, "step": 12674 }, { "epoch": 0.18926526254488985, "grad_norm": 0.283203125, "grad_norm_var": 0.011922136942545573, "learning_rate": 0.0001, "loss": 1.4519, "loss/crossentropy": 2.464362621307373, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21756327152252197, "step": 12675 }, { "epoch": 0.1892801947155048, "grad_norm": 0.34765625, "grad_norm_var": 0.0117279052734375, "learning_rate": 0.0001, "loss": 1.5538, "loss/crossentropy": 2.6837908029556274, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.2373720332980156, "step": 12676 }, { "epoch": 0.1892951268861198, "grad_norm": 0.65625, "grad_norm_var": 0.017440652847290038, "learning_rate": 0.0001, "loss": 1.8369, "loss/crossentropy": 2.8282153606414795, "loss/fcd": 1.46875, "loss/idx": 11.0, "loss/logits": 0.36812354624271393, "step": 12677 }, { "epoch": 0.1893100590567348, "grad_norm": 0.2734375, "grad_norm_var": 0.017585611343383788, "learning_rate": 0.0001, "loss": 1.3683, "loss/crossentropy": 2.3925225734710693, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.20427284389734268, "step": 12678 }, { "epoch": 0.18932499122734978, "grad_norm": 0.388671875, "grad_norm_var": 0.01731255849202474, "learning_rate": 0.0001, "loss": 1.42, "loss/crossentropy": 2.766049027442932, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.18949805945158005, "step": 12679 }, { "epoch": 0.18933992339796474, "grad_norm": 0.28515625, "grad_norm_var": 0.017832040786743164, "learning_rate": 0.0001, "loss": 1.2992, "loss/crossentropy": 2.803817391395569, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.17028261721134186, "step": 12680 }, { "epoch": 0.18935485556857973, "grad_norm": 0.330078125, "grad_norm_var": 0.017923990885416668, "learning_rate": 0.0001, "loss": 1.3474, "loss/crossentropy": 2.338342070579529, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.19116852432489395, "step": 12681 }, { "epoch": 0.18936978773919472, "grad_norm": 0.31640625, "grad_norm_var": 0.018097798029581707, "learning_rate": 0.0001, "loss": 1.4394, "loss/crossentropy": 2.59196674823761, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20116712898015976, "step": 12682 }, { "epoch": 0.18938471990980968, "grad_norm": 0.310546875, "grad_norm_var": 0.01765020688374837, "learning_rate": 0.0001, "loss": 1.6709, "loss/crossentropy": 2.114510178565979, "loss/fcd": 1.40234375, "loss/idx": 11.0, "loss/logits": 0.2685401439666748, "step": 12683 }, { "epoch": 0.18939965208042467, "grad_norm": 0.31640625, "grad_norm_var": 0.017114623387654623, "learning_rate": 0.0001, "loss": 1.3308, "loss/crossentropy": 2.5923218727111816, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1706830859184265, "step": 12684 }, { "epoch": 0.18941458425103966, "grad_norm": 0.388671875, "grad_norm_var": 0.01712352434794108, "learning_rate": 0.0001, "loss": 1.3877, "loss/crossentropy": 2.569551110267639, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.16508998721837997, "step": 12685 }, { "epoch": 0.18942951642165462, "grad_norm": 0.28125, "grad_norm_var": 0.01764092445373535, "learning_rate": 0.0001, "loss": 1.2984, "loss/crossentropy": 2.6815680265426636, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1733708530664444, "step": 12686 }, { "epoch": 0.1894444485922696, "grad_norm": 0.349609375, "grad_norm_var": 0.016973352432250975, "learning_rate": 0.0001, "loss": 1.6445, "loss/crossentropy": 2.550514817237854, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.26173035800457, "step": 12687 }, { "epoch": 0.1894593807628846, "grad_norm": 0.275390625, "grad_norm_var": 0.017495155334472656, "learning_rate": 0.0001, "loss": 1.3442, "loss/crossentropy": 2.5071544647216797, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17622895538806915, "step": 12688 }, { "epoch": 0.1894743129334996, "grad_norm": 0.279296875, "grad_norm_var": 0.008639383316040038, "learning_rate": 0.0001, "loss": 1.3547, "loss/crossentropy": 2.6183958053588867, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18675338476896286, "step": 12689 }, { "epoch": 0.18948924510411455, "grad_norm": 0.2890625, "grad_norm_var": 0.008750343322753906, "learning_rate": 0.0001, "loss": 1.3185, "loss/crossentropy": 2.2743067741394043, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17397211492061615, "step": 12690 }, { "epoch": 0.18950417727472954, "grad_norm": 0.27734375, "grad_norm_var": 0.008793497085571289, "learning_rate": 0.0001, "loss": 1.4318, "loss/crossentropy": 2.5346776247024536, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2052377387881279, "step": 12691 }, { "epoch": 0.18951910944534453, "grad_norm": 0.296875, "grad_norm_var": 0.008871189753214518, "learning_rate": 0.0001, "loss": 1.4814, "loss/crossentropy": 2.5875920057296753, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.20406991988420486, "step": 12692 }, { "epoch": 0.1895340416159595, "grad_norm": 0.275390625, "grad_norm_var": 0.0014790217081705728, "learning_rate": 0.0001, "loss": 1.2784, "loss/crossentropy": 2.443010091781616, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.17289094626903534, "step": 12693 }, { "epoch": 0.18954897378657448, "grad_norm": 0.33984375, "grad_norm_var": 0.0014455159505208333, "learning_rate": 0.0001, "loss": 1.6981, "loss/crossentropy": 2.6169689893722534, "loss/fcd": 1.41796875, "loss/idx": 11.0, "loss/logits": 0.2801508158445358, "step": 12694 }, { "epoch": 0.18956390595718947, "grad_norm": 0.328125, "grad_norm_var": 0.001059707005818685, "learning_rate": 0.0001, "loss": 1.5146, "loss/crossentropy": 2.5953948497772217, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.2099415808916092, "step": 12695 }, { "epoch": 0.18957883812780446, "grad_norm": 0.3046875, "grad_norm_var": 0.001022195816040039, "learning_rate": 0.0001, "loss": 1.432, "loss/crossentropy": 2.607097029685974, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20936405658721924, "step": 12696 }, { "epoch": 0.18959377029841942, "grad_norm": 0.421875, "grad_norm_var": 0.0017953872680664062, "learning_rate": 0.0001, "loss": 1.5244, "loss/crossentropy": 2.543352961540222, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.2040601447224617, "step": 12697 }, { "epoch": 0.1896087024690344, "grad_norm": 0.279296875, "grad_norm_var": 0.0018778324127197265, "learning_rate": 0.0001, "loss": 1.3313, "loss/crossentropy": 2.6242899894714355, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1789778545498848, "step": 12698 }, { "epoch": 0.1896236346396494, "grad_norm": 0.3046875, "grad_norm_var": 0.001882171630859375, "learning_rate": 0.0001, "loss": 1.4316, "loss/crossentropy": 2.6024436950683594, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20499098300933838, "step": 12699 }, { "epoch": 0.18963856681026436, "grad_norm": 0.31640625, "grad_norm_var": 0.001882171630859375, "learning_rate": 0.0001, "loss": 1.4005, "loss/crossentropy": 2.5461448431015015, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19347435235977173, "step": 12700 }, { "epoch": 0.18965349898087935, "grad_norm": 0.50390625, "grad_norm_var": 0.003874953587849935, "learning_rate": 0.0001, "loss": 1.5995, "loss/crossentropy": 2.4926319122314453, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.23625487089157104, "step": 12701 }, { "epoch": 0.18966843115149434, "grad_norm": 0.30078125, "grad_norm_var": 0.0037973880767822265, "learning_rate": 0.0001, "loss": 1.4435, "loss/crossentropy": 2.6879128217697144, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.23256275057792664, "step": 12702 }, { "epoch": 0.1896833633221093, "grad_norm": 0.26171875, "grad_norm_var": 0.003949737548828125, "learning_rate": 0.0001, "loss": 1.3559, "loss/crossentropy": 2.598409056663513, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18400582671165466, "step": 12703 }, { "epoch": 0.1896982954927243, "grad_norm": 0.275390625, "grad_norm_var": 0.003949737548828125, "learning_rate": 0.0001, "loss": 1.2917, "loss/crossentropy": 2.5249701738357544, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.17450562119483948, "step": 12704 }, { "epoch": 0.18971322766333928, "grad_norm": 0.275390625, "grad_norm_var": 0.003969764709472657, "learning_rate": 0.0001, "loss": 1.3348, "loss/crossentropy": 2.6939685344696045, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18246030062437057, "step": 12705 }, { "epoch": 0.18972815983395427, "grad_norm": 0.27734375, "grad_norm_var": 0.004019927978515625, "learning_rate": 0.0001, "loss": 1.3686, "loss/crossentropy": 2.771973490715027, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19281524419784546, "step": 12706 }, { "epoch": 0.18974309200456924, "grad_norm": 0.287109375, "grad_norm_var": 0.003976933161417643, "learning_rate": 0.0001, "loss": 1.435, "loss/crossentropy": 2.5653231143951416, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2084345445036888, "step": 12707 }, { "epoch": 0.18975802417518423, "grad_norm": 0.259765625, "grad_norm_var": 0.004155413309733073, "learning_rate": 0.0001, "loss": 1.3407, "loss/crossentropy": 2.6210235357284546, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1766408532857895, "step": 12708 }, { "epoch": 0.18977295634579922, "grad_norm": 0.373046875, "grad_norm_var": 0.00425872802734375, "learning_rate": 0.0001, "loss": 1.8854, "loss/crossentropy": 2.27763831615448, "loss/fcd": 1.5859375, "loss/idx": 11.0, "loss/logits": 0.29941989481449127, "step": 12709 }, { "epoch": 0.18978788851641418, "grad_norm": 0.345703125, "grad_norm_var": 0.004276895523071289, "learning_rate": 0.0001, "loss": 1.5168, "loss/crossentropy": 2.3665835857391357, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.20824569463729858, "step": 12710 }, { "epoch": 0.18980282068702917, "grad_norm": 0.337890625, "grad_norm_var": 0.0042938232421875, "learning_rate": 0.0001, "loss": 1.5755, "loss/crossentropy": 2.705082416534424, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.22397802770137787, "step": 12711 }, { "epoch": 0.18981775285764416, "grad_norm": 0.3046875, "grad_norm_var": 0.0042938232421875, "learning_rate": 0.0001, "loss": 1.36, "loss/crossentropy": 2.6331796646118164, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1920689418911934, "step": 12712 }, { "epoch": 0.18983268502825915, "grad_norm": 0.384765625, "grad_norm_var": 0.0038773695627848308, "learning_rate": 0.0001, "loss": 1.6507, "loss/crossentropy": 2.495506763458252, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.24055521190166473, "step": 12713 }, { "epoch": 0.1898476171988741, "grad_norm": 0.31640625, "grad_norm_var": 0.00377197265625, "learning_rate": 0.0001, "loss": 1.4375, "loss/crossentropy": 2.4686273336410522, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.18747781962156296, "step": 12714 }, { "epoch": 0.1898625493694891, "grad_norm": 0.291015625, "grad_norm_var": 0.003812138239542643, "learning_rate": 0.0001, "loss": 1.3673, "loss/crossentropy": 2.652945399284363, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.17981933802366257, "step": 12715 }, { "epoch": 0.1898774815401041, "grad_norm": 0.333984375, "grad_norm_var": 0.003824297587076823, "learning_rate": 0.0001, "loss": 1.5451, "loss/crossentropy": 2.572095513343811, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.21311718970537186, "step": 12716 }, { "epoch": 0.18989241371071905, "grad_norm": 0.30859375, "grad_norm_var": 0.0014337539672851563, "learning_rate": 0.0001, "loss": 1.5041, "loss/crossentropy": 2.5237449407577515, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22287149727344513, "step": 12717 }, { "epoch": 0.18990734588133404, "grad_norm": 0.306640625, "grad_norm_var": 0.0014299869537353516, "learning_rate": 0.0001, "loss": 1.3895, "loss/crossentropy": 2.6148622035980225, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.1824725866317749, "step": 12718 }, { "epoch": 0.18992227805194903, "grad_norm": 0.294921875, "grad_norm_var": 0.0012908299763997396, "learning_rate": 0.0001, "loss": 1.3627, "loss/crossentropy": 2.570311427116394, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18689439445734024, "step": 12719 }, { "epoch": 0.189937210222564, "grad_norm": 0.294921875, "grad_norm_var": 0.0012224833170572916, "learning_rate": 0.0001, "loss": 1.3282, "loss/crossentropy": 2.5598844289779663, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17589301615953445, "step": 12720 }, { "epoch": 0.18995214239317898, "grad_norm": 0.291015625, "grad_norm_var": 0.0011614481608072916, "learning_rate": 0.0001, "loss": 1.4548, "loss/crossentropy": 2.6292502880096436, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2125696837902069, "step": 12721 }, { "epoch": 0.18996707456379397, "grad_norm": 0.291015625, "grad_norm_var": 0.0011081536610921224, "learning_rate": 0.0001, "loss": 1.3636, "loss/crossentropy": 2.5103180408477783, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18395128101110458, "step": 12722 }, { "epoch": 0.18998200673440896, "grad_norm": 0.326171875, "grad_norm_var": 0.001064284642537435, "learning_rate": 0.0001, "loss": 1.551, "loss/crossentropy": 2.882192015647888, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.22682493925094604, "step": 12723 }, { "epoch": 0.18999693890502392, "grad_norm": 0.275390625, "grad_norm_var": 0.0009617964426676432, "learning_rate": 0.0001, "loss": 1.4129, "loss/crossentropy": 2.6546534299850464, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.21760638058185577, "step": 12724 }, { "epoch": 0.1900118710756389, "grad_norm": 0.294921875, "grad_norm_var": 0.0007621606190999348, "learning_rate": 0.0001, "loss": 1.4145, "loss/crossentropy": 2.4660173654556274, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.18408038467168808, "step": 12725 }, { "epoch": 0.1900268032462539, "grad_norm": 0.31640625, "grad_norm_var": 0.0006856282552083333, "learning_rate": 0.0001, "loss": 1.5126, "loss/crossentropy": 2.3659387826919556, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.20793625712394714, "step": 12726 }, { "epoch": 0.19004173541686886, "grad_norm": 0.314453125, "grad_norm_var": 0.0006345113118489584, "learning_rate": 0.0001, "loss": 1.3815, "loss/crossentropy": 2.5860944986343384, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1900653839111328, "step": 12727 }, { "epoch": 0.19005666758748385, "grad_norm": 0.3359375, "grad_norm_var": 0.0006772359212239583, "learning_rate": 0.0001, "loss": 1.4486, "loss/crossentropy": 2.6685538291931152, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20252443104982376, "step": 12728 }, { "epoch": 0.19007159975809884, "grad_norm": 0.314453125, "grad_norm_var": 0.0002950032552083333, "learning_rate": 0.0001, "loss": 1.4967, "loss/crossentropy": 2.608553647994995, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.21149401366710663, "step": 12729 }, { "epoch": 0.19008653192871383, "grad_norm": 0.330078125, "grad_norm_var": 0.00032448768615722656, "learning_rate": 0.0001, "loss": 1.5165, "loss/crossentropy": 2.591108798980713, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.22351718693971634, "step": 12730 }, { "epoch": 0.1901014640993288, "grad_norm": 0.30859375, "grad_norm_var": 0.00030517578125, "learning_rate": 0.0001, "loss": 1.4336, "loss/crossentropy": 2.378714919090271, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.1992558166384697, "step": 12731 }, { "epoch": 0.19011639626994378, "grad_norm": 0.2734375, "grad_norm_var": 0.0003293196360270182, "learning_rate": 0.0001, "loss": 1.3291, "loss/crossentropy": 2.598193883895874, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18461225926876068, "step": 12732 }, { "epoch": 0.19013132844055877, "grad_norm": 0.265625, "grad_norm_var": 0.0004230340321858724, "learning_rate": 0.0001, "loss": 1.3006, "loss/crossentropy": 2.542188048362732, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.1794830709695816, "step": 12733 }, { "epoch": 0.19014626061117373, "grad_norm": 0.30078125, "grad_norm_var": 0.0004216512044270833, "learning_rate": 0.0001, "loss": 1.318, "loss/crossentropy": 2.7273383140563965, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17739613354206085, "step": 12734 }, { "epoch": 0.19016119278178872, "grad_norm": 0.33203125, "grad_norm_var": 0.00047389666239420575, "learning_rate": 0.0001, "loss": 1.3746, "loss/crossentropy": 2.6007527112960815, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.17145608365535736, "step": 12735 }, { "epoch": 0.1901761249524037, "grad_norm": 0.32421875, "grad_norm_var": 0.0004917780558268229, "learning_rate": 0.0001, "loss": 1.4687, "loss/crossentropy": 2.541442036628723, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20702588558197021, "step": 12736 }, { "epoch": 0.19019105712301868, "grad_norm": 0.318359375, "grad_norm_var": 0.00048421223958333335, "learning_rate": 0.0001, "loss": 1.4431, "loss/crossentropy": 2.7088147401809692, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2087017148733139, "step": 12737 }, { "epoch": 0.19020598929363366, "grad_norm": 0.2890625, "grad_norm_var": 0.0004887739817301432, "learning_rate": 0.0001, "loss": 1.3022, "loss/crossentropy": 2.4820972681045532, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.16547179222106934, "step": 12738 }, { "epoch": 0.19022092146424865, "grad_norm": 0.267578125, "grad_norm_var": 0.0005574385325113932, "learning_rate": 0.0001, "loss": 1.3213, "loss/crossentropy": 2.6602914333343506, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1729084700345993, "step": 12739 }, { "epoch": 0.19023585363486364, "grad_norm": 0.3203125, "grad_norm_var": 0.0005132039388020833, "learning_rate": 0.0001, "loss": 1.4691, "loss/crossentropy": 2.5237776041030884, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21124280244112015, "step": 12740 }, { "epoch": 0.1902507858054786, "grad_norm": 0.306640625, "grad_norm_var": 0.0005034764607747395, "learning_rate": 0.0001, "loss": 1.3986, "loss/crossentropy": 2.6757932901382446, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19938258081674576, "step": 12741 }, { "epoch": 0.1902657179760936, "grad_norm": 0.33984375, "grad_norm_var": 0.0005660374959309896, "learning_rate": 0.0001, "loss": 1.4709, "loss/crossentropy": 2.827696919441223, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.22481736540794373, "step": 12742 }, { "epoch": 0.19028065014670859, "grad_norm": 0.26953125, "grad_norm_var": 0.0006585280100504557, "learning_rate": 0.0001, "loss": 1.3226, "loss/crossentropy": 2.7207661867141724, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1702244132757187, "step": 12743 }, { "epoch": 0.19029558231732355, "grad_norm": 0.279296875, "grad_norm_var": 0.0006331761678059895, "learning_rate": 0.0001, "loss": 1.306, "loss/crossentropy": 2.457805037498474, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.16537167876958847, "step": 12744 }, { "epoch": 0.19031051448793854, "grad_norm": 0.275390625, "grad_norm_var": 0.0006662368774414063, "learning_rate": 0.0001, "loss": 1.3559, "loss/crossentropy": 2.47430956363678, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18400227278470993, "step": 12745 }, { "epoch": 0.19032544665855353, "grad_norm": 0.287109375, "grad_norm_var": 0.000609588623046875, "learning_rate": 0.0001, "loss": 1.3337, "loss/crossentropy": 2.6995153427124023, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.16570252925157547, "step": 12746 }, { "epoch": 0.1903403788291685, "grad_norm": 0.314453125, "grad_norm_var": 0.0006205081939697265, "learning_rate": 0.0001, "loss": 1.4056, "loss/crossentropy": 2.6096839904785156, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19853170216083527, "step": 12747 }, { "epoch": 0.19035531099978348, "grad_norm": 0.29296875, "grad_norm_var": 0.0005810896555582683, "learning_rate": 0.0001, "loss": 1.4124, "loss/crossentropy": 2.70488965511322, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18977169692516327, "step": 12748 }, { "epoch": 0.19037024317039847, "grad_norm": 0.287109375, "grad_norm_var": 0.0005144755045572917, "learning_rate": 0.0001, "loss": 1.3124, "loss/crossentropy": 2.728072166442871, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17956437170505524, "step": 12749 }, { "epoch": 0.19038517534101346, "grad_norm": 0.33203125, "grad_norm_var": 0.000577545166015625, "learning_rate": 0.0001, "loss": 1.3862, "loss/crossentropy": 2.7292957305908203, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19867827743291855, "step": 12750 }, { "epoch": 0.19040010751162842, "grad_norm": 0.30859375, "grad_norm_var": 0.000518798828125, "learning_rate": 0.0001, "loss": 1.412, "loss/crossentropy": 2.4774417877197266, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.1971392184495926, "step": 12751 }, { "epoch": 0.1904150396822434, "grad_norm": 0.2890625, "grad_norm_var": 0.00048618316650390626, "learning_rate": 0.0001, "loss": 1.3758, "loss/crossentropy": 2.7697499990463257, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19613683223724365, "step": 12752 }, { "epoch": 0.1904299718528584, "grad_norm": 0.31640625, "grad_norm_var": 0.00048127174377441404, "learning_rate": 0.0001, "loss": 1.4616, "loss/crossentropy": 2.6044188737869263, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.22331120073795319, "step": 12753 }, { "epoch": 0.19044490402347336, "grad_norm": 0.79296875, "grad_norm_var": 0.015719842910766602, "learning_rate": 0.0001, "loss": 2.0727, "loss/crossentropy": 2.5574734210968018, "loss/fcd": 1.68359375, "loss/idx": 11.0, "loss/logits": 0.38910040259361267, "step": 12754 }, { "epoch": 0.19045983619408835, "grad_norm": 0.33984375, "grad_norm_var": 0.015445200602213542, "learning_rate": 0.0001, "loss": 1.4287, "loss/crossentropy": 2.639474391937256, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19825097918510437, "step": 12755 }, { "epoch": 0.19047476836470334, "grad_norm": 0.421875, "grad_norm_var": 0.01589813232421875, "learning_rate": 0.0001, "loss": 1.5996, "loss/crossentropy": 2.600898027420044, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.21678845584392548, "step": 12756 }, { "epoch": 0.19048970053531833, "grad_norm": 0.37890625, "grad_norm_var": 0.01589519182840983, "learning_rate": 0.0001, "loss": 1.5073, "loss/crossentropy": 2.106062650680542, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.226033017039299, "step": 12757 }, { "epoch": 0.1905046327059333, "grad_norm": 0.5625, "grad_norm_var": 0.018830601374308267, "learning_rate": 0.0001, "loss": 1.8803, "loss/crossentropy": 2.766907572746277, "loss/fcd": 1.54296875, "loss/idx": 11.0, "loss/logits": 0.3373717814683914, "step": 12758 }, { "epoch": 0.19051956487654828, "grad_norm": 0.34375, "grad_norm_var": 0.018287007013956705, "learning_rate": 0.0001, "loss": 1.3576, "loss/crossentropy": 3.083821415901184, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18576475232839584, "step": 12759 }, { "epoch": 0.19053449704716327, "grad_norm": 0.3125, "grad_norm_var": 0.01798140207926432, "learning_rate": 0.0001, "loss": 1.4964, "loss/crossentropy": 2.482161045074463, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21909164637327194, "step": 12760 }, { "epoch": 0.19054942921777823, "grad_norm": 0.29296875, "grad_norm_var": 0.017788426081339518, "learning_rate": 0.0001, "loss": 1.4098, "loss/crossentropy": 2.5907013416290283, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.21053585410118103, "step": 12761 }, { "epoch": 0.19056436138839322, "grad_norm": 0.5234375, "grad_norm_var": 0.018759663899739584, "learning_rate": 0.0001, "loss": 1.6632, "loss/crossentropy": 2.3510771989822388, "loss/fcd": 1.43359375, "loss/idx": 11.0, "loss/logits": 0.2296135276556015, "step": 12762 }, { "epoch": 0.1905792935590082, "grad_norm": 0.326171875, "grad_norm_var": 0.01866296132405599, "learning_rate": 0.0001, "loss": 1.4451, "loss/crossentropy": 2.5092803239822388, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19507446885108948, "step": 12763 }, { "epoch": 0.19059422572962317, "grad_norm": 0.32421875, "grad_norm_var": 0.018350664774576822, "learning_rate": 0.0001, "loss": 1.5492, "loss/crossentropy": 2.5225603580474854, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.24447999894618988, "step": 12764 }, { "epoch": 0.19060915790023816, "grad_norm": 0.283203125, "grad_norm_var": 0.01840235392252604, "learning_rate": 0.0001, "loss": 1.3514, "loss/crossentropy": 2.6274677515029907, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1873057559132576, "step": 12765 }, { "epoch": 0.19062409007085315, "grad_norm": 0.302734375, "grad_norm_var": 0.01866008440653483, "learning_rate": 0.0001, "loss": 1.3052, "loss/crossentropy": 2.666028141975403, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1723952665925026, "step": 12766 }, { "epoch": 0.19063902224146814, "grad_norm": 0.326171875, "grad_norm_var": 0.018506304423014323, "learning_rate": 0.0001, "loss": 1.424, "loss/crossentropy": 2.7453049421310425, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19350632280111313, "step": 12767 }, { "epoch": 0.1906539544120831, "grad_norm": 0.314453125, "grad_norm_var": 0.01822673479715983, "learning_rate": 0.0001, "loss": 1.5883, "loss/crossentropy": 2.396880626678467, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.22502049058675766, "step": 12768 }, { "epoch": 0.1906688865826981, "grad_norm": 0.376953125, "grad_norm_var": 0.017901039123535155, "learning_rate": 0.0001, "loss": 1.5609, "loss/crossentropy": 2.2542881965637207, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.23277593404054642, "step": 12769 }, { "epoch": 0.19068381875331308, "grad_norm": 0.310546875, "grad_norm_var": 0.0064568678538004555, "learning_rate": 0.0001, "loss": 1.4151, "loss/crossentropy": 2.4268730878829956, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19639290869235992, "step": 12770 }, { "epoch": 0.19069875092392805, "grad_norm": 0.287109375, "grad_norm_var": 0.0067637125651041664, "learning_rate": 0.0001, "loss": 1.3055, "loss/crossentropy": 2.3882412910461426, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1609908491373062, "step": 12771 }, { "epoch": 0.19071368309454304, "grad_norm": 1.1796875, "grad_norm_var": 0.049365997314453125, "learning_rate": 0.0001, "loss": 1.5554, "loss/crossentropy": 2.3797953128814697, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.22336481511592865, "step": 12772 }, { "epoch": 0.19072861526515802, "grad_norm": 0.3203125, "grad_norm_var": 0.049767494201660156, "learning_rate": 0.0001, "loss": 1.5069, "loss/crossentropy": 2.399223208427429, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2178490087389946, "step": 12773 }, { "epoch": 0.19074354743577301, "grad_norm": 0.310546875, "grad_norm_var": 0.04824814796447754, "learning_rate": 0.0001, "loss": 1.3857, "loss/crossentropy": 2.729777216911316, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1864388808608055, "step": 12774 }, { "epoch": 0.19075847960638798, "grad_norm": 0.3046875, "grad_norm_var": 0.04855014483133952, "learning_rate": 0.0001, "loss": 1.4484, "loss/crossentropy": 2.5086376667022705, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.2335895374417305, "step": 12775 }, { "epoch": 0.19077341177700297, "grad_norm": 0.26953125, "grad_norm_var": 0.0490578810373942, "learning_rate": 0.0001, "loss": 1.316, "loss/crossentropy": 2.5649999380111694, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1792750060558319, "step": 12776 }, { "epoch": 0.19078834394761796, "grad_norm": 0.82421875, "grad_norm_var": 0.060653034845987955, "learning_rate": 0.0001, "loss": 1.7188, "loss/crossentropy": 2.4227280616760254, "loss/fcd": 1.40625, "loss/idx": 11.0, "loss/logits": 0.3125530928373337, "step": 12777 }, { "epoch": 0.19080327611823292, "grad_norm": 0.388671875, "grad_norm_var": 0.05977675120035807, "learning_rate": 0.0001, "loss": 1.4534, "loss/crossentropy": 2.6623990535736084, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.1995403841137886, "step": 12778 }, { "epoch": 0.1908182082888479, "grad_norm": 0.318359375, "grad_norm_var": 0.05986067454020182, "learning_rate": 0.0001, "loss": 1.314, "loss/crossentropy": 2.6290453672409058, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.16557537019252777, "step": 12779 }, { "epoch": 0.1908331404594629, "grad_norm": 0.2890625, "grad_norm_var": 0.060305277506510414, "learning_rate": 0.0001, "loss": 1.425, "loss/crossentropy": 2.5653408765792847, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20621013641357422, "step": 12780 }, { "epoch": 0.19084807263007786, "grad_norm": 0.279296875, "grad_norm_var": 0.06036726633707682, "learning_rate": 0.0001, "loss": 1.3167, "loss/crossentropy": 2.811257004737854, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17603185772895813, "step": 12781 }, { "epoch": 0.19086300480069285, "grad_norm": 0.490234375, "grad_norm_var": 0.06012922922770182, "learning_rate": 0.0001, "loss": 1.6731, "loss/crossentropy": 2.4288493394851685, "loss/fcd": 1.4453125, "loss/idx": 11.0, "loss/logits": 0.2278168797492981, "step": 12782 }, { "epoch": 0.19087793697130784, "grad_norm": 0.357421875, "grad_norm_var": 0.05983320871988932, "learning_rate": 0.0001, "loss": 1.3482, "loss/crossentropy": 2.192407727241516, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1607203334569931, "step": 12783 }, { "epoch": 0.19089286914192283, "grad_norm": 0.291015625, "grad_norm_var": 0.060178057352701826, "learning_rate": 0.0001, "loss": 1.2948, "loss/crossentropy": 2.7283806800842285, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1698060855269432, "step": 12784 }, { "epoch": 0.1909078013125378, "grad_norm": 0.26953125, "grad_norm_var": 0.06140631039937337, "learning_rate": 0.0001, "loss": 1.3296, "loss/crossentropy": 2.5817233324050903, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1928665041923523, "step": 12785 }, { "epoch": 0.19092273348315278, "grad_norm": 0.271484375, "grad_norm_var": 0.061996952692667646, "learning_rate": 0.0001, "loss": 1.2682, "loss/crossentropy": 2.5897973775863647, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.16663488000631332, "step": 12786 }, { "epoch": 0.19093766565376777, "grad_norm": 0.359375, "grad_norm_var": 0.061204783121744794, "learning_rate": 0.0001, "loss": 1.5238, "loss/crossentropy": 2.622979998588562, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.2269698902964592, "step": 12787 }, { "epoch": 0.19095259782438273, "grad_norm": 0.306640625, "grad_norm_var": 0.01898051897684733, "learning_rate": 0.0001, "loss": 1.4766, "loss/crossentropy": 2.5313323736190796, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.19530098140239716, "step": 12788 }, { "epoch": 0.19096752999499772, "grad_norm": 0.5078125, "grad_norm_var": 0.02035686175028483, "learning_rate": 0.0001, "loss": 1.5943, "loss/crossentropy": 2.629228353500366, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.27004997432231903, "step": 12789 }, { "epoch": 0.1909824621656127, "grad_norm": 0.34375, "grad_norm_var": 0.020185279846191406, "learning_rate": 0.0001, "loss": 1.5673, "loss/crossentropy": 2.3543819189071655, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.2196546196937561, "step": 12790 }, { "epoch": 0.1909973943362277, "grad_norm": 0.376953125, "grad_norm_var": 0.019911813735961913, "learning_rate": 0.0001, "loss": 1.4956, "loss/crossentropy": 2.2913018465042114, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.1909004971385002, "step": 12791 }, { "epoch": 0.19101232650684266, "grad_norm": 0.84765625, "grad_norm_var": 0.03294409116109212, "learning_rate": 0.0001, "loss": 1.479, "loss/crossentropy": 2.348365604877472, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.18602126836776733, "step": 12792 }, { "epoch": 0.19102725867745765, "grad_norm": 0.302734375, "grad_norm_var": 0.020972188313802084, "learning_rate": 0.0001, "loss": 1.4206, "loss/crossentropy": 2.568907141685486, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19798583537340164, "step": 12793 }, { "epoch": 0.19104219084807264, "grad_norm": 0.310546875, "grad_norm_var": 0.02121124267578125, "learning_rate": 0.0001, "loss": 1.4902, "loss/crossentropy": 2.864022970199585, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.22454826533794403, "step": 12794 }, { "epoch": 0.1910571230186876, "grad_norm": 0.3203125, "grad_norm_var": 0.021198002497355144, "learning_rate": 0.0001, "loss": 1.4466, "loss/crossentropy": 2.8146337270736694, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21220187842845917, "step": 12795 }, { "epoch": 0.1910720551893026, "grad_norm": 0.322265625, "grad_norm_var": 0.02090752919514974, "learning_rate": 0.0001, "loss": 1.4145, "loss/crossentropy": 2.8697733879089355, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19964615255594254, "step": 12796 }, { "epoch": 0.19108698735991758, "grad_norm": 0.341796875, "grad_norm_var": 0.02037652333577474, "learning_rate": 0.0001, "loss": 1.5322, "loss/crossentropy": 2.7738330364227295, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.22747942060232162, "step": 12797 }, { "epoch": 0.19110191953053254, "grad_norm": 0.2734375, "grad_norm_var": 0.020018370946248372, "learning_rate": 0.0001, "loss": 1.3975, "loss/crossentropy": 2.5328720808029175, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19438670575618744, "step": 12798 }, { "epoch": 0.19111685170114753, "grad_norm": 0.30078125, "grad_norm_var": 0.020258522033691405, "learning_rate": 0.0001, "loss": 1.2781, "loss/crossentropy": 2.7240511178970337, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.16486629843711853, "step": 12799 }, { "epoch": 0.19113178387176252, "grad_norm": 0.33203125, "grad_norm_var": 0.019991159439086914, "learning_rate": 0.0001, "loss": 1.3756, "loss/crossentropy": 2.4739092588424683, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18422887474298477, "step": 12800 }, { "epoch": 0.1911467160423775, "grad_norm": 0.31640625, "grad_norm_var": 0.01955246925354004, "learning_rate": 0.0001, "loss": 1.3662, "loss/crossentropy": 2.617809534072876, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19042599946260452, "step": 12801 }, { "epoch": 0.19116164821299247, "grad_norm": 0.3359375, "grad_norm_var": 0.019011688232421876, "learning_rate": 0.0001, "loss": 1.3975, "loss/crossentropy": 2.586477518081665, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1944154128432274, "step": 12802 }, { "epoch": 0.19117658038360746, "grad_norm": 0.306640625, "grad_norm_var": 0.0192507266998291, "learning_rate": 0.0001, "loss": 1.3263, "loss/crossentropy": 2.4031201601028442, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18179119378328323, "step": 12803 }, { "epoch": 0.19119151255422245, "grad_norm": 0.28515625, "grad_norm_var": 0.019447771708170573, "learning_rate": 0.0001, "loss": 1.3588, "loss/crossentropy": 2.7581961154937744, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.19862715899944305, "step": 12804 }, { "epoch": 0.19120644472483742, "grad_norm": 0.2890625, "grad_norm_var": 0.018244361877441405, "learning_rate": 0.0001, "loss": 1.419, "loss/crossentropy": 2.56661856174469, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.18458351492881775, "step": 12805 }, { "epoch": 0.1912213768954524, "grad_norm": 0.306640625, "grad_norm_var": 0.018363046646118163, "learning_rate": 0.0001, "loss": 1.4759, "loss/crossentropy": 2.615119218826294, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2259267196059227, "step": 12806 }, { "epoch": 0.1912363090660674, "grad_norm": 0.36328125, "grad_norm_var": 0.018321990966796875, "learning_rate": 0.0001, "loss": 1.5748, "loss/crossentropy": 2.607254147529602, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.2271202802658081, "step": 12807 }, { "epoch": 0.19125124123668236, "grad_norm": 0.296875, "grad_norm_var": 0.0005273818969726562, "learning_rate": 0.0001, "loss": 1.37, "loss/crossentropy": 2.580726981163025, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18637648224830627, "step": 12808 }, { "epoch": 0.19126617340729735, "grad_norm": 0.345703125, "grad_norm_var": 0.0005854288736979167, "learning_rate": 0.0001, "loss": 1.5642, "loss/crossentropy": 2.7642533779144287, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.22432173788547516, "step": 12809 }, { "epoch": 0.19128110557791234, "grad_norm": 0.55859375, "grad_norm_var": 0.00426939328511556, "learning_rate": 0.0001, "loss": 1.631, "loss/crossentropy": 2.4152939319610596, "loss/fcd": 1.40234375, "loss/idx": 11.0, "loss/logits": 0.22866053879261017, "step": 12810 }, { "epoch": 0.19129603774852733, "grad_norm": 0.32421875, "grad_norm_var": 0.00426481564839681, "learning_rate": 0.0001, "loss": 1.3619, "loss/crossentropy": 2.698020100593567, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.19005876779556274, "step": 12811 }, { "epoch": 0.1913109699191423, "grad_norm": 0.458984375, "grad_norm_var": 0.005270624160766601, "learning_rate": 0.0001, "loss": 1.4359, "loss/crossentropy": 2.2672455310821533, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.19373402744531631, "step": 12812 }, { "epoch": 0.19132590208975728, "grad_norm": 0.3125, "grad_norm_var": 0.005316162109375, "learning_rate": 0.0001, "loss": 1.3768, "loss/crossentropy": 2.9636716842651367, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19711507111787796, "step": 12813 }, { "epoch": 0.19134083426037227, "grad_norm": 0.322265625, "grad_norm_var": 0.005045557022094726, "learning_rate": 0.0001, "loss": 1.3857, "loss/crossentropy": 2.5360301733016968, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1903536394238472, "step": 12814 }, { "epoch": 0.19135576643098723, "grad_norm": 0.294921875, "grad_norm_var": 0.005079078674316406, "learning_rate": 0.0001, "loss": 1.5204, "loss/crossentropy": 2.4468631744384766, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.23134148120880127, "step": 12815 }, { "epoch": 0.19137069860160222, "grad_norm": 0.33203125, "grad_norm_var": 0.005079078674316406, "learning_rate": 0.0001, "loss": 1.561, "loss/crossentropy": 2.6711761951446533, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.2563386783003807, "step": 12816 }, { "epoch": 0.1913856307722172, "grad_norm": 0.310546875, "grad_norm_var": 0.005100107192993164, "learning_rate": 0.0001, "loss": 1.4883, "loss/crossentropy": 2.6536275148391724, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.22271990776062012, "step": 12817 }, { "epoch": 0.1914005629428322, "grad_norm": 0.33984375, "grad_norm_var": 0.0050988356272379555, "learning_rate": 0.0001, "loss": 1.4442, "loss/crossentropy": 2.664584994316101, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.1942397803068161, "step": 12818 }, { "epoch": 0.19141549511344716, "grad_norm": 0.306640625, "grad_norm_var": 0.0050988356272379555, "learning_rate": 0.0001, "loss": 1.4063, "loss/crossentropy": 2.7442792654037476, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19929665327072144, "step": 12819 }, { "epoch": 0.19143042728406215, "grad_norm": 0.30078125, "grad_norm_var": 0.00499889055887858, "learning_rate": 0.0001, "loss": 1.3493, "loss/crossentropy": 2.6480530500411987, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1890954077243805, "step": 12820 }, { "epoch": 0.19144535945467714, "grad_norm": 0.384765625, "grad_norm_var": 0.004903093973795573, "learning_rate": 0.0001, "loss": 1.5944, "loss/crossentropy": 2.4829975366592407, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.23497851938009262, "step": 12821 }, { "epoch": 0.1914602916252921, "grad_norm": 0.376953125, "grad_norm_var": 0.0048298517862955725, "learning_rate": 0.0001, "loss": 1.8781, "loss/crossentropy": 2.3358116149902344, "loss/fcd": 1.51953125, "loss/idx": 11.0, "loss/logits": 0.3585960119962692, "step": 12822 }, { "epoch": 0.1914752237959071, "grad_norm": 0.291015625, "grad_norm_var": 0.005045684178670248, "learning_rate": 0.0001, "loss": 1.2569, "loss/crossentropy": 2.4549028873443604, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.15141742676496506, "step": 12823 }, { "epoch": 0.19149015596652208, "grad_norm": 0.287109375, "grad_norm_var": 0.005117289225260417, "learning_rate": 0.0001, "loss": 1.271, "loss/crossentropy": 2.554845690727234, "loss/fcd": 1.099609375, "loss/idx": 11.0, "loss/logits": 0.1713612750172615, "step": 12824 }, { "epoch": 0.19150508813713704, "grad_norm": 0.28125, "grad_norm_var": 0.0053853193918863935, "learning_rate": 0.0001, "loss": 1.4222, "loss/crossentropy": 2.4151159524917603, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1955929696559906, "step": 12825 }, { "epoch": 0.19152002030775203, "grad_norm": 0.33203125, "grad_norm_var": 0.0020702203114827474, "learning_rate": 0.0001, "loss": 1.3987, "loss/crossentropy": 2.835370182991028, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1955535039305687, "step": 12826 }, { "epoch": 0.19153495247836702, "grad_norm": 0.31640625, "grad_norm_var": 0.0020784854888916014, "learning_rate": 0.0001, "loss": 1.4242, "loss/crossentropy": 2.575809955596924, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20151927322149277, "step": 12827 }, { "epoch": 0.191549884648982, "grad_norm": 0.33984375, "grad_norm_var": 0.0008849461873372395, "learning_rate": 0.0001, "loss": 1.4672, "loss/crossentropy": 2.674934983253479, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21722020953893661, "step": 12828 }, { "epoch": 0.19156481681959697, "grad_norm": 0.296875, "grad_norm_var": 0.0009169896443684896, "learning_rate": 0.0001, "loss": 1.2962, "loss/crossentropy": 2.6621971130371094, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.17900294065475464, "step": 12829 }, { "epoch": 0.19157974899021196, "grad_norm": 0.369140625, "grad_norm_var": 0.0010711034138997397, "learning_rate": 0.0001, "loss": 1.4179, "loss/crossentropy": 2.456557035446167, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.1796012818813324, "step": 12830 }, { "epoch": 0.19159468116082695, "grad_norm": 0.29296875, "grad_norm_var": 0.0010785261789957683, "learning_rate": 0.0001, "loss": 1.3743, "loss/crossentropy": 2.5968575477600098, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1828969344496727, "step": 12831 }, { "epoch": 0.1916096133314419, "grad_norm": 0.443359375, "grad_norm_var": 0.001996294657389323, "learning_rate": 0.0001, "loss": 1.5582, "loss/crossentropy": 2.4895708560943604, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.23008284717798233, "step": 12832 }, { "epoch": 0.1916245455020569, "grad_norm": 0.337890625, "grad_norm_var": 0.0019744873046875, "learning_rate": 0.0001, "loss": 1.4667, "loss/crossentropy": 2.5526612997055054, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.22837141156196594, "step": 12833 }, { "epoch": 0.1916394776726719, "grad_norm": 0.333984375, "grad_norm_var": 0.001969766616821289, "learning_rate": 0.0001, "loss": 1.5473, "loss/crossentropy": 2.2668702602386475, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.23480592668056488, "step": 12834 }, { "epoch": 0.19165440984328688, "grad_norm": 0.4453125, "grad_norm_var": 0.0027269999186197917, "learning_rate": 0.0001, "loss": 1.555, "loss/crossentropy": 2.5554975271224976, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.2112528756260872, "step": 12835 }, { "epoch": 0.19166934201390184, "grad_norm": 0.326171875, "grad_norm_var": 0.00263670285542806, "learning_rate": 0.0001, "loss": 1.4206, "loss/crossentropy": 2.656300663948059, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20964600890874863, "step": 12836 }, { "epoch": 0.19168427418451683, "grad_norm": 0.31640625, "grad_norm_var": 0.002529335021972656, "learning_rate": 0.0001, "loss": 1.4536, "loss/crossentropy": 2.480362057685852, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19580398499965668, "step": 12837 }, { "epoch": 0.19169920635513182, "grad_norm": 0.298828125, "grad_norm_var": 0.002491188049316406, "learning_rate": 0.0001, "loss": 1.3167, "loss/crossentropy": 2.6283973455429077, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.16822770982980728, "step": 12838 }, { "epoch": 0.19171413852574679, "grad_norm": 0.37890625, "grad_norm_var": 0.002496194839477539, "learning_rate": 0.0001, "loss": 1.4406, "loss/crossentropy": 2.410072684288025, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20617635548114777, "step": 12839 }, { "epoch": 0.19172907069636178, "grad_norm": 0.28515625, "grad_norm_var": 0.0025094985961914063, "learning_rate": 0.0001, "loss": 1.3518, "loss/crossentropy": 2.6216260194778442, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17995400726795197, "step": 12840 }, { "epoch": 0.19174400286697676, "grad_norm": 0.26953125, "grad_norm_var": 0.002605438232421875, "learning_rate": 0.0001, "loss": 1.3098, "loss/crossentropy": 2.6371275186538696, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1652621254324913, "step": 12841 }, { "epoch": 0.19175893503759173, "grad_norm": 0.298828125, "grad_norm_var": 0.002693796157836914, "learning_rate": 0.0001, "loss": 1.3539, "loss/crossentropy": 2.6642411947250366, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18592000752687454, "step": 12842 }, { "epoch": 0.19177386720820672, "grad_norm": 0.341796875, "grad_norm_var": 0.00267333984375, "learning_rate": 0.0001, "loss": 1.5697, "loss/crossentropy": 2.336825370788574, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.22593285143375397, "step": 12843 }, { "epoch": 0.1917887993788217, "grad_norm": 0.34765625, "grad_norm_var": 0.0026812235514322917, "learning_rate": 0.0001, "loss": 1.466, "loss/crossentropy": 2.8548824787139893, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.20039281994104385, "step": 12844 }, { "epoch": 0.1918037315494367, "grad_norm": 0.2890625, "grad_norm_var": 0.0027262369791666665, "learning_rate": 0.0001, "loss": 1.3397, "loss/crossentropy": 2.7431164979934692, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.19121548533439636, "step": 12845 }, { "epoch": 0.19181866372005166, "grad_norm": 0.283203125, "grad_norm_var": 0.002807362874348958, "learning_rate": 0.0001, "loss": 1.3768, "loss/crossentropy": 2.726262331008911, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.189348466694355, "step": 12846 }, { "epoch": 0.19183359589066665, "grad_norm": 0.341796875, "grad_norm_var": 0.0027115980784098307, "learning_rate": 0.0001, "loss": 1.6676, "loss/crossentropy": 2.279746890068054, "loss/fcd": 1.42578125, "loss/idx": 11.0, "loss/logits": 0.24180921912193298, "step": 12847 }, { "epoch": 0.19184852806128164, "grad_norm": 0.287109375, "grad_norm_var": 0.0019512017567952474, "learning_rate": 0.0001, "loss": 1.3526, "loss/crossentropy": 2.5485633611679077, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1885678619146347, "step": 12848 }, { "epoch": 0.1918634602318966, "grad_norm": 0.326171875, "grad_norm_var": 0.00193785031636556, "learning_rate": 0.0001, "loss": 1.3882, "loss/crossentropy": 2.677897810935974, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19288334995508194, "step": 12849 }, { "epoch": 0.1918783924025116, "grad_norm": 0.33984375, "grad_norm_var": 0.0019484837849934897, "learning_rate": 0.0001, "loss": 1.4696, "loss/crossentropy": 2.671549677848816, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21182676404714584, "step": 12850 }, { "epoch": 0.19189332457312658, "grad_norm": 0.26953125, "grad_norm_var": 0.0010243733723958334, "learning_rate": 0.0001, "loss": 1.361, "loss/crossentropy": 2.727570056915283, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19305531680583954, "step": 12851 }, { "epoch": 0.19190825674374157, "grad_norm": 0.3125, "grad_norm_var": 0.0010111331939697266, "learning_rate": 0.0001, "loss": 1.45, "loss/crossentropy": 2.22195827960968, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21561305969953537, "step": 12852 }, { "epoch": 0.19192318891435653, "grad_norm": 0.29296875, "grad_norm_var": 0.0010305881500244141, "learning_rate": 0.0001, "loss": 1.4164, "loss/crossentropy": 2.5548007488250732, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.18980161100625992, "step": 12853 }, { "epoch": 0.19193812108497152, "grad_norm": 0.310546875, "grad_norm_var": 0.001021432876586914, "learning_rate": 0.0001, "loss": 1.4557, "loss/crossentropy": 2.7408740520477295, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.22523800283670425, "step": 12854 }, { "epoch": 0.1919530532555865, "grad_norm": 0.2734375, "grad_norm_var": 0.0007605075836181641, "learning_rate": 0.0001, "loss": 1.3604, "loss/crossentropy": 2.460801124572754, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1806684508919716, "step": 12855 }, { "epoch": 0.19196798542620147, "grad_norm": 0.271484375, "grad_norm_var": 0.0008071263631184896, "learning_rate": 0.0001, "loss": 1.3269, "loss/crossentropy": 2.590912103652954, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1823333501815796, "step": 12856 }, { "epoch": 0.19198291759681646, "grad_norm": 0.28515625, "grad_norm_var": 0.0007516860961914063, "learning_rate": 0.0001, "loss": 1.251, "loss/crossentropy": 2.5917892456054688, "loss/fcd": 1.0859375, "loss/idx": 11.0, "loss/logits": 0.16504646837711334, "step": 12857 }, { "epoch": 0.19199784976743145, "grad_norm": 0.26953125, "grad_norm_var": 0.0008272647857666016, "learning_rate": 0.0001, "loss": 1.3405, "loss/crossentropy": 2.4458853006362915, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1803644448518753, "step": 12858 }, { "epoch": 0.1920127819380464, "grad_norm": 0.271484375, "grad_norm_var": 0.0007688999176025391, "learning_rate": 0.0001, "loss": 1.3632, "loss/crossentropy": 2.1856868863105774, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.20308878272771835, "step": 12859 }, { "epoch": 0.1920277141086614, "grad_norm": 0.310546875, "grad_norm_var": 0.0006103515625, "learning_rate": 0.0001, "loss": 1.424, "loss/crossentropy": 2.7835229635238647, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20520994067192078, "step": 12860 }, { "epoch": 0.1920426462792764, "grad_norm": 0.326171875, "grad_norm_var": 0.0006625970204671224, "learning_rate": 0.0001, "loss": 1.5417, "loss/crossentropy": 2.6707780361175537, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.21358966082334518, "step": 12861 }, { "epoch": 0.19205757844989138, "grad_norm": 0.310546875, "grad_norm_var": 0.0006545861562093099, "learning_rate": 0.0001, "loss": 1.4789, "loss/crossentropy": 2.916717052459717, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21722666174173355, "step": 12862 }, { "epoch": 0.19207251062050634, "grad_norm": 0.2890625, "grad_norm_var": 0.0005339940388997395, "learning_rate": 0.0001, "loss": 1.3402, "loss/crossentropy": 2.7230807542800903, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.17612324655056, "step": 12863 }, { "epoch": 0.19208744279112133, "grad_norm": 0.2890625, "grad_norm_var": 0.0005317529042561849, "learning_rate": 0.0001, "loss": 1.4227, "loss/crossentropy": 2.5650532245635986, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20394054800271988, "step": 12864 }, { "epoch": 0.19210237496173632, "grad_norm": 0.263671875, "grad_norm_var": 0.0005307356516520182, "learning_rate": 0.0001, "loss": 1.3559, "loss/crossentropy": 2.554872512817383, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18013563007116318, "step": 12865 }, { "epoch": 0.19211730713235128, "grad_norm": 0.3203125, "grad_norm_var": 0.0004321893056233724, "learning_rate": 0.0001, "loss": 1.7023, "loss/crossentropy": 3.018772840499878, "loss/fcd": 1.4375, "loss/idx": 11.0, "loss/logits": 0.26484619081020355, "step": 12866 }, { "epoch": 0.19213223930296627, "grad_norm": 0.298828125, "grad_norm_var": 0.00039952596028645834, "learning_rate": 0.0001, "loss": 1.4247, "loss/crossentropy": 2.840328335762024, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20206225663423538, "step": 12867 }, { "epoch": 0.19214717147358126, "grad_norm": 0.330078125, "grad_norm_var": 0.0004634698232014974, "learning_rate": 0.0001, "loss": 1.4268, "loss/crossentropy": 2.72058367729187, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19635310024023056, "step": 12868 }, { "epoch": 0.19216210364419622, "grad_norm": 0.31640625, "grad_norm_var": 0.0004928429921468099, "learning_rate": 0.0001, "loss": 1.4329, "loss/crossentropy": 2.642712354660034, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.2102600783109665, "step": 12869 }, { "epoch": 0.19217703581481121, "grad_norm": 0.302734375, "grad_norm_var": 0.00048152605692545575, "learning_rate": 0.0001, "loss": 1.3603, "loss/crossentropy": 2.504085898399353, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.17281575500965118, "step": 12870 }, { "epoch": 0.1921919679854262, "grad_norm": 0.279296875, "grad_norm_var": 0.00046641031901041666, "learning_rate": 0.0001, "loss": 1.3264, "loss/crossentropy": 2.51378858089447, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17794574052095413, "step": 12871 }, { "epoch": 0.1922069001560412, "grad_norm": 0.30859375, "grad_norm_var": 0.00043168067932128904, "learning_rate": 0.0001, "loss": 1.3195, "loss/crossentropy": 2.8445372581481934, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17498525977134705, "step": 12872 }, { "epoch": 0.19222183232665616, "grad_norm": 0.345703125, "grad_norm_var": 0.0005553563435872396, "learning_rate": 0.0001, "loss": 1.5553, "loss/crossentropy": 2.538027048110962, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.23891737312078476, "step": 12873 }, { "epoch": 0.19223676449727115, "grad_norm": 0.30859375, "grad_norm_var": 0.00048160552978515625, "learning_rate": 0.0001, "loss": 1.513, "loss/crossentropy": 2.4733537435531616, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.20831362158060074, "step": 12874 }, { "epoch": 0.19225169666788613, "grad_norm": 0.283203125, "grad_norm_var": 0.000438690185546875, "learning_rate": 0.0001, "loss": 1.3046, "loss/crossentropy": 2.693610429763794, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17174659669399261, "step": 12875 }, { "epoch": 0.1922666288385011, "grad_norm": 0.294921875, "grad_norm_var": 0.00044275919596354165, "learning_rate": 0.0001, "loss": 1.345, "loss/crossentropy": 2.4802443981170654, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1848844662308693, "step": 12876 }, { "epoch": 0.1922815610091161, "grad_norm": 0.5078125, "grad_norm_var": 0.0030369917551676434, "learning_rate": 0.0001, "loss": 1.7332, "loss/crossentropy": 2.4901883602142334, "loss/fcd": 1.4375, "loss/idx": 11.0, "loss/logits": 0.2957487404346466, "step": 12877 }, { "epoch": 0.19229649317973108, "grad_norm": 0.265625, "grad_norm_var": 0.003193092346191406, "learning_rate": 0.0001, "loss": 1.373, "loss/crossentropy": 2.5627466440200806, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19333789497613907, "step": 12878 }, { "epoch": 0.19231142535034607, "grad_norm": 0.34375, "grad_norm_var": 0.0032073338826497395, "learning_rate": 0.0001, "loss": 1.5036, "loss/crossentropy": 2.656955122947693, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.23012296855449677, "step": 12879 }, { "epoch": 0.19232635752096103, "grad_norm": 0.314453125, "grad_norm_var": 0.0031558831532796225, "learning_rate": 0.0001, "loss": 1.4591, "loss/crossentropy": 2.4195507764816284, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.18955405801534653, "step": 12880 }, { "epoch": 0.19234128969157602, "grad_norm": 0.359375, "grad_norm_var": 0.0030382792154947918, "learning_rate": 0.0001, "loss": 1.4527, "loss/crossentropy": 2.764565348625183, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21835646033287048, "step": 12881 }, { "epoch": 0.192356221862191, "grad_norm": 0.333984375, "grad_norm_var": 0.0030437310536702475, "learning_rate": 0.0001, "loss": 1.4981, "loss/crossentropy": 2.5208390951156616, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2090097963809967, "step": 12882 }, { "epoch": 0.19237115403280597, "grad_norm": 0.322265625, "grad_norm_var": 0.002997573216756185, "learning_rate": 0.0001, "loss": 1.4269, "loss/crossentropy": 2.6263973712921143, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.21200811862945557, "step": 12883 }, { "epoch": 0.19238608620342096, "grad_norm": 0.291015625, "grad_norm_var": 0.0030719598134358725, "learning_rate": 0.0001, "loss": 1.3591, "loss/crossentropy": 2.615379810333252, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1871999204158783, "step": 12884 }, { "epoch": 0.19240101837403595, "grad_norm": 0.296875, "grad_norm_var": 0.0031145572662353515, "learning_rate": 0.0001, "loss": 1.3465, "loss/crossentropy": 2.609472870826721, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.20195931941270828, "step": 12885 }, { "epoch": 0.1924159505446509, "grad_norm": 0.279296875, "grad_norm_var": 0.003210306167602539, "learning_rate": 0.0001, "loss": 1.3562, "loss/crossentropy": 2.7369202375411987, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1843050792813301, "step": 12886 }, { "epoch": 0.1924308827152659, "grad_norm": 0.3046875, "grad_norm_var": 0.0031096776326497394, "learning_rate": 0.0001, "loss": 1.4394, "loss/crossentropy": 2.3893686532974243, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20113345980644226, "step": 12887 }, { "epoch": 0.1924458148858809, "grad_norm": 0.263671875, "grad_norm_var": 0.0033191521962483723, "learning_rate": 0.0001, "loss": 1.2593, "loss/crossentropy": 2.6383860111236572, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.16167447715997696, "step": 12888 }, { "epoch": 0.19246074705649588, "grad_norm": 0.3125, "grad_norm_var": 0.0032729466756184896, "learning_rate": 0.0001, "loss": 1.4361, "loss/crossentropy": 2.4201791286468506, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.19392980635166168, "step": 12889 }, { "epoch": 0.19247567922711084, "grad_norm": 0.318359375, "grad_norm_var": 0.0032671451568603515, "learning_rate": 0.0001, "loss": 1.3574, "loss/crossentropy": 2.699098229408264, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.181611567735672, "step": 12890 }, { "epoch": 0.19249061139772583, "grad_norm": 0.275390625, "grad_norm_var": 0.003307453791300456, "learning_rate": 0.0001, "loss": 1.3476, "loss/crossentropy": 2.53923499584198, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1874633952975273, "step": 12891 }, { "epoch": 0.19250554356834082, "grad_norm": 0.314453125, "grad_norm_var": 0.0032718499501546224, "learning_rate": 0.0001, "loss": 1.6129, "loss/crossentropy": 2.4782259464263916, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.24571756273508072, "step": 12892 }, { "epoch": 0.19252047573895578, "grad_norm": 0.28515625, "grad_norm_var": 0.0007640679677327473, "learning_rate": 0.0001, "loss": 1.3065, "loss/crossentropy": 2.499908924102783, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.1658843532204628, "step": 12893 }, { "epoch": 0.19253540790957077, "grad_norm": 0.29296875, "grad_norm_var": 0.0006670475006103516, "learning_rate": 0.0001, "loss": 1.4354, "loss/crossentropy": 2.7352964878082275, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2049277052283287, "step": 12894 }, { "epoch": 0.19255034008018576, "grad_norm": 0.333984375, "grad_norm_var": 0.000624847412109375, "learning_rate": 0.0001, "loss": 1.5086, "loss/crossentropy": 2.8156076669692993, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.20784753561019897, "step": 12895 }, { "epoch": 0.19256527225080075, "grad_norm": 0.380859375, "grad_norm_var": 0.0009739557902018229, "learning_rate": 0.0001, "loss": 1.626, "loss/crossentropy": 2.4533913135528564, "loss/fcd": 1.40234375, "loss/idx": 11.0, "loss/logits": 0.22367661446332932, "step": 12896 }, { "epoch": 0.1925802044214157, "grad_norm": 0.302734375, "grad_norm_var": 0.0008038679758707682, "learning_rate": 0.0001, "loss": 1.3904, "loss/crossentropy": 2.6308491230010986, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18334250897169113, "step": 12897 }, { "epoch": 0.1925951365920307, "grad_norm": 0.6015625, "grad_norm_var": 0.00624993642171224, "learning_rate": 0.0001, "loss": 2.101, "loss/crossentropy": 2.6142196655273438, "loss/fcd": 1.75, "loss/idx": 11.0, "loss/logits": 0.3510410040616989, "step": 12898 }, { "epoch": 0.1926100687626457, "grad_norm": 0.298828125, "grad_norm_var": 0.00628808339436849, "learning_rate": 0.0001, "loss": 1.3784, "loss/crossentropy": 2.8296009302139282, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19482456892728806, "step": 12899 }, { "epoch": 0.19262500093326065, "grad_norm": 0.302734375, "grad_norm_var": 0.0062482198079427086, "learning_rate": 0.0001, "loss": 1.4635, "loss/crossentropy": 2.6422849893569946, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.21737563610076904, "step": 12900 }, { "epoch": 0.19263993310387564, "grad_norm": 0.306640625, "grad_norm_var": 0.006220483779907226, "learning_rate": 0.0001, "loss": 1.3381, "loss/crossentropy": 2.5558491945266724, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18578772246837616, "step": 12901 }, { "epoch": 0.19265486527449063, "grad_norm": 0.283203125, "grad_norm_var": 0.006198485692342122, "learning_rate": 0.0001, "loss": 1.2525, "loss/crossentropy": 2.808030605316162, "loss/fcd": 1.0859375, "loss/idx": 11.0, "loss/logits": 0.16659359633922577, "step": 12902 }, { "epoch": 0.1926697974451056, "grad_norm": 0.34375, "grad_norm_var": 0.0061953067779541016, "learning_rate": 0.0001, "loss": 1.4987, "loss/crossentropy": 2.4732874631881714, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.20963870733976364, "step": 12903 }, { "epoch": 0.19268472961572058, "grad_norm": 0.34375, "grad_norm_var": 0.005930074055989583, "learning_rate": 0.0001, "loss": 1.7125, "loss/crossentropy": 2.528750777244568, "loss/fcd": 1.4609375, "loss/idx": 11.0, "loss/logits": 0.2516070753335953, "step": 12904 }, { "epoch": 0.19269966178633557, "grad_norm": 0.3046875, "grad_norm_var": 0.005953216552734375, "learning_rate": 0.0001, "loss": 1.5984, "loss/crossentropy": 2.643608570098877, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.2468349188566208, "step": 12905 }, { "epoch": 0.19271459395695056, "grad_norm": 0.318359375, "grad_norm_var": 0.005953216552734375, "learning_rate": 0.0001, "loss": 1.3796, "loss/crossentropy": 2.5975958108901978, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1998668909072876, "step": 12906 }, { "epoch": 0.19272952612756553, "grad_norm": 0.28125, "grad_norm_var": 0.005912256240844726, "learning_rate": 0.0001, "loss": 1.3366, "loss/crossentropy": 2.644844174385071, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18039494007825851, "step": 12907 }, { "epoch": 0.19274445829818052, "grad_norm": 0.291015625, "grad_norm_var": 0.005998086929321289, "learning_rate": 0.0001, "loss": 1.3488, "loss/crossentropy": 2.539412260055542, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18867423385381699, "step": 12908 }, { "epoch": 0.1927593904687955, "grad_norm": 0.306640625, "grad_norm_var": 0.005900001525878907, "learning_rate": 0.0001, "loss": 1.3709, "loss/crossentropy": 2.8984785079956055, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.19903704524040222, "step": 12909 }, { "epoch": 0.19277432263941047, "grad_norm": 0.267578125, "grad_norm_var": 0.00606840451558431, "learning_rate": 0.0001, "loss": 1.4931, "loss/crossentropy": 2.197495222091675, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21580088138580322, "step": 12910 }, { "epoch": 0.19278925481002546, "grad_norm": 0.294921875, "grad_norm_var": 0.0061389764149983725, "learning_rate": 0.0001, "loss": 1.324, "loss/crossentropy": 2.5114575624465942, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.16779300570487976, "step": 12911 }, { "epoch": 0.19280418698064045, "grad_norm": 0.302734375, "grad_norm_var": 0.0059571425120035805, "learning_rate": 0.0001, "loss": 1.3426, "loss/crossentropy": 2.723499298095703, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18639197200536728, "step": 12912 }, { "epoch": 0.19281911915125544, "grad_norm": 0.3046875, "grad_norm_var": 0.00595239003499349, "learning_rate": 0.0001, "loss": 1.4274, "loss/crossentropy": 2.5087544918060303, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19694830477237701, "step": 12913 }, { "epoch": 0.1928340513218704, "grad_norm": 0.302734375, "grad_norm_var": 0.0003955682118733724, "learning_rate": 0.0001, "loss": 1.4188, "loss/crossentropy": 2.655640482902527, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20787429064512253, "step": 12914 }, { "epoch": 0.1928489834924854, "grad_norm": 0.287109375, "grad_norm_var": 0.0004112084706624349, "learning_rate": 0.0001, "loss": 1.2718, "loss/crossentropy": 2.417094588279724, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.16634006053209305, "step": 12915 }, { "epoch": 0.19286391566310038, "grad_norm": 0.28125, "grad_norm_var": 0.00043970743815104166, "learning_rate": 0.0001, "loss": 1.3422, "loss/crossentropy": 2.661401629447937, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18599064648151398, "step": 12916 }, { "epoch": 0.19287884783371534, "grad_norm": 0.29296875, "grad_norm_var": 0.0004415988922119141, "learning_rate": 0.0001, "loss": 1.5034, "loss/crossentropy": 2.4191726446151733, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22218643873929977, "step": 12917 }, { "epoch": 0.19289378000433033, "grad_norm": 0.287109375, "grad_norm_var": 0.00043358802795410154, "learning_rate": 0.0001, "loss": 1.3706, "loss/crossentropy": 2.7708146572113037, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19486801326274872, "step": 12918 }, { "epoch": 0.19290871217494532, "grad_norm": 0.33984375, "grad_norm_var": 0.00041209856669108074, "learning_rate": 0.0001, "loss": 1.4226, "loss/crossentropy": 2.7493101358413696, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19606082886457443, "step": 12919 }, { "epoch": 0.19292364434556028, "grad_norm": 0.3046875, "grad_norm_var": 0.00028176307678222655, "learning_rate": 0.0001, "loss": 1.3796, "loss/crossentropy": 2.4414182901382446, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1921411082148552, "step": 12920 }, { "epoch": 0.19293857651617527, "grad_norm": 0.29296875, "grad_norm_var": 0.00027985572814941405, "learning_rate": 0.0001, "loss": 1.355, "loss/crossentropy": 2.607632279396057, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.19092387706041336, "step": 12921 }, { "epoch": 0.19295350868679026, "grad_norm": 0.30078125, "grad_norm_var": 0.00024967193603515626, "learning_rate": 0.0001, "loss": 1.4567, "loss/crossentropy": 2.7066365480422974, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19892805814743042, "step": 12922 }, { "epoch": 0.19296844085740525, "grad_norm": 0.30859375, "grad_norm_var": 0.00024210611979166667, "learning_rate": 0.0001, "loss": 1.3122, "loss/crossentropy": 2.7737128734588623, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17940997332334518, "step": 12923 }, { "epoch": 0.1929833730280202, "grad_norm": 0.29296875, "grad_norm_var": 0.00024056434631347656, "learning_rate": 0.0001, "loss": 1.4444, "loss/crossentropy": 2.7052687406539917, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20613503456115723, "step": 12924 }, { "epoch": 0.1929983051986352, "grad_norm": 0.294921875, "grad_norm_var": 0.00023560523986816407, "learning_rate": 0.0001, "loss": 1.4319, "loss/crossentropy": 2.5536949634552, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.2209339216351509, "step": 12925 }, { "epoch": 0.1930132373692502, "grad_norm": 0.322265625, "grad_norm_var": 0.00020623207092285156, "learning_rate": 0.0001, "loss": 1.537, "loss/crossentropy": 2.413161873817444, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.24008983373641968, "step": 12926 }, { "epoch": 0.19302816953986515, "grad_norm": 0.267578125, "grad_norm_var": 0.0002738793690999349, "learning_rate": 0.0001, "loss": 1.4053, "loss/crossentropy": 2.6508442163467407, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.2060832679271698, "step": 12927 }, { "epoch": 0.19304310171048014, "grad_norm": 0.361328125, "grad_norm_var": 0.0005180199940999348, "learning_rate": 0.0001, "loss": 1.6003, "loss/crossentropy": 2.6788101196289062, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.26045334339141846, "step": 12928 }, { "epoch": 0.19305803388109513, "grad_norm": 0.291015625, "grad_norm_var": 0.0005259195963541667, "learning_rate": 0.0001, "loss": 1.4292, "loss/crossentropy": 2.539973258972168, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20659078657627106, "step": 12929 }, { "epoch": 0.1930729660517101, "grad_norm": 0.26953125, "grad_norm_var": 0.0005904992421468098, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.5628433227539062, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.18653618544340134, "step": 12930 }, { "epoch": 0.19308789822232508, "grad_norm": 0.30859375, "grad_norm_var": 0.0005833307902018229, "learning_rate": 0.0001, "loss": 1.4979, "loss/crossentropy": 2.2880911827087402, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.18145682662725449, "step": 12931 }, { "epoch": 0.19310283039294007, "grad_norm": 0.27734375, "grad_norm_var": 0.0005945841471354167, "learning_rate": 0.0001, "loss": 1.4384, "loss/crossentropy": 2.467341184616089, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.204040065407753, "step": 12932 }, { "epoch": 0.19311776256355506, "grad_norm": 0.3203125, "grad_norm_var": 0.0006128311157226563, "learning_rate": 0.0001, "loss": 1.5796, "loss/crossentropy": 2.564815044403076, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.24761372059583664, "step": 12933 }, { "epoch": 0.19313269473417002, "grad_norm": 0.3125, "grad_norm_var": 0.0006010532379150391, "learning_rate": 0.0001, "loss": 1.3572, "loss/crossentropy": 2.7014529705047607, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18922283500432968, "step": 12934 }, { "epoch": 0.193147626904785, "grad_norm": 0.265625, "grad_norm_var": 0.0005913893381754557, "learning_rate": 0.0001, "loss": 1.3389, "loss/crossentropy": 2.6004709005355835, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18656755238771439, "step": 12935 }, { "epoch": 0.1931625590754, "grad_norm": 0.294921875, "grad_norm_var": 0.00059051513671875, "learning_rate": 0.0001, "loss": 1.4653, "loss/crossentropy": 2.5527673959732056, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.21919043362140656, "step": 12936 }, { "epoch": 0.19317749124601497, "grad_norm": 0.2890625, "grad_norm_var": 0.0005945205688476562, "learning_rate": 0.0001, "loss": 1.4307, "loss/crossentropy": 2.5712087154388428, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20020243525505066, "step": 12937 }, { "epoch": 0.19319242341662995, "grad_norm": 0.283203125, "grad_norm_var": 0.0006086826324462891, "learning_rate": 0.0001, "loss": 1.3502, "loss/crossentropy": 2.6145882606506348, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18222886323928833, "step": 12938 }, { "epoch": 0.19320735558724494, "grad_norm": 0.287109375, "grad_norm_var": 0.0006057103474934896, "learning_rate": 0.0001, "loss": 1.3593, "loss/crossentropy": 2.93410062789917, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.19912396371364594, "step": 12939 }, { "epoch": 0.19322228775785993, "grad_norm": 0.384765625, "grad_norm_var": 0.0010935306549072266, "learning_rate": 0.0001, "loss": 1.5002, "loss/crossentropy": 2.502390742301941, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.21505945175886154, "step": 12940 }, { "epoch": 0.1932372199284749, "grad_norm": 0.326171875, "grad_norm_var": 0.0011255741119384766, "learning_rate": 0.0001, "loss": 1.378, "loss/crossentropy": 2.7211179733276367, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1865944340825081, "step": 12941 }, { "epoch": 0.19325215209908989, "grad_norm": 0.298828125, "grad_norm_var": 0.001102304458618164, "learning_rate": 0.0001, "loss": 1.598, "loss/crossentropy": 2.5820751190185547, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.26205362379550934, "step": 12942 }, { "epoch": 0.19326708426970488, "grad_norm": 0.26953125, "grad_norm_var": 0.0010934829711914062, "learning_rate": 0.0001, "loss": 1.4121, "loss/crossentropy": 2.518070697784424, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.2050764560699463, "step": 12943 }, { "epoch": 0.19328201644031984, "grad_norm": 0.322265625, "grad_norm_var": 0.0008824030558268229, "learning_rate": 0.0001, "loss": 1.4301, "loss/crossentropy": 2.6039239168167114, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2113477662205696, "step": 12944 }, { "epoch": 0.19329694861093483, "grad_norm": 0.376953125, "grad_norm_var": 0.0012404759724934896, "learning_rate": 0.0001, "loss": 1.4216, "loss/crossentropy": 2.707436203956604, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19505105912685394, "step": 12945 }, { "epoch": 0.19331188078154982, "grad_norm": 0.302734375, "grad_norm_var": 0.00115049680074056, "learning_rate": 0.0001, "loss": 1.3831, "loss/crossentropy": 2.66709041595459, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18000555038452148, "step": 12946 }, { "epoch": 0.19332681295216478, "grad_norm": 0.35546875, "grad_norm_var": 0.0012946923573811848, "learning_rate": 0.0001, "loss": 1.5284, "loss/crossentropy": 2.547454595565796, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2432897984981537, "step": 12947 }, { "epoch": 0.19334174512277977, "grad_norm": 0.29296875, "grad_norm_var": 0.0012410322825113932, "learning_rate": 0.0001, "loss": 1.3564, "loss/crossentropy": 2.7597492933273315, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18843424320220947, "step": 12948 }, { "epoch": 0.19335667729339476, "grad_norm": 0.28515625, "grad_norm_var": 0.0012765089670817057, "learning_rate": 0.0001, "loss": 1.4872, "loss/crossentropy": 2.4371304512023926, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.20590122044086456, "step": 12949 }, { "epoch": 0.19337160946400975, "grad_norm": 0.310546875, "grad_norm_var": 0.0012758890787760416, "learning_rate": 0.0001, "loss": 1.3929, "loss/crossentropy": 2.62880539894104, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19366716593503952, "step": 12950 }, { "epoch": 0.1933865416346247, "grad_norm": 0.2890625, "grad_norm_var": 0.0011744181315104167, "learning_rate": 0.0001, "loss": 1.5361, "loss/crossentropy": 2.4500290155410767, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.24314214289188385, "step": 12951 }, { "epoch": 0.1934014738052397, "grad_norm": 0.3671875, "grad_norm_var": 0.001350259780883789, "learning_rate": 0.0001, "loss": 1.7324, "loss/crossentropy": 2.5867034196853638, "loss/fcd": 1.44921875, "loss/idx": 11.0, "loss/logits": 0.2832246199250221, "step": 12952 }, { "epoch": 0.1934164059758547, "grad_norm": 0.310546875, "grad_norm_var": 0.00130462646484375, "learning_rate": 0.0001, "loss": 1.4367, "loss/crossentropy": 2.6484415531158447, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.1905590444803238, "step": 12953 }, { "epoch": 0.19343133814646965, "grad_norm": 0.291015625, "grad_norm_var": 0.0012738545735677083, "learning_rate": 0.0001, "loss": 1.5026, "loss/crossentropy": 2.551292061805725, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22135280072689056, "step": 12954 }, { "epoch": 0.19344627031708464, "grad_norm": 0.291015625, "grad_norm_var": 0.001259295145670573, "learning_rate": 0.0001, "loss": 1.3801, "loss/crossentropy": 2.72893750667572, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18871185928583145, "step": 12955 }, { "epoch": 0.19346120248769963, "grad_norm": 0.283203125, "grad_norm_var": 0.0009881973266601563, "learning_rate": 0.0001, "loss": 1.331, "loss/crossentropy": 2.6730599403381348, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.18258091062307358, "step": 12956 }, { "epoch": 0.19347613465831462, "grad_norm": 0.275390625, "grad_norm_var": 0.00104522705078125, "learning_rate": 0.0001, "loss": 1.2838, "loss/crossentropy": 2.452457308769226, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.16664093732833862, "step": 12957 }, { "epoch": 0.19349106682892958, "grad_norm": 0.26171875, "grad_norm_var": 0.001174783706665039, "learning_rate": 0.0001, "loss": 1.3382, "loss/crossentropy": 2.650221824645996, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17800355702638626, "step": 12958 }, { "epoch": 0.19350599899954457, "grad_norm": 0.310546875, "grad_norm_var": 0.0010843276977539062, "learning_rate": 0.0001, "loss": 1.401, "loss/crossentropy": 2.6592079401016235, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20181754976511002, "step": 12959 }, { "epoch": 0.19352093117015956, "grad_norm": 0.291015625, "grad_norm_var": 0.0010853449503580729, "learning_rate": 0.0001, "loss": 1.4807, "loss/crossentropy": 2.584745407104492, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21893801540136337, "step": 12960 }, { "epoch": 0.19353586334077452, "grad_norm": 0.3046875, "grad_norm_var": 0.0007271925608317057, "learning_rate": 0.0001, "loss": 1.4302, "loss/crossentropy": 2.4995583295822144, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.18801740556955338, "step": 12961 }, { "epoch": 0.1935507955113895, "grad_norm": 0.302734375, "grad_norm_var": 0.0007271925608317057, "learning_rate": 0.0001, "loss": 1.4451, "loss/crossentropy": 2.5807617902755737, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20681827515363693, "step": 12962 }, { "epoch": 0.1935657276820045, "grad_norm": 0.326171875, "grad_norm_var": 0.0005695978800455729, "learning_rate": 0.0001, "loss": 1.4034, "loss/crossentropy": 2.894270896911621, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.18466652184724808, "step": 12963 }, { "epoch": 0.19358065985261946, "grad_norm": 0.28125, "grad_norm_var": 0.0005884806315104167, "learning_rate": 0.0001, "loss": 1.3888, "loss/crossentropy": 2.8717132806777954, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19347433745861053, "step": 12964 }, { "epoch": 0.19359559202323445, "grad_norm": 0.2890625, "grad_norm_var": 0.0005823135375976563, "learning_rate": 0.0001, "loss": 1.53, "loss/crossentropy": 2.7071441411972046, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.23706887662410736, "step": 12965 }, { "epoch": 0.19361052419384944, "grad_norm": 0.326171875, "grad_norm_var": 0.0006214777628580729, "learning_rate": 0.0001, "loss": 1.3713, "loss/crossentropy": 2.5992709398269653, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.16427310556173325, "step": 12966 }, { "epoch": 0.19362545636446443, "grad_norm": 0.29296875, "grad_norm_var": 0.0006167093912760416, "learning_rate": 0.0001, "loss": 1.4375, "loss/crossentropy": 2.480005383491516, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.207035630941391, "step": 12967 }, { "epoch": 0.1936403885350794, "grad_norm": 0.263671875, "grad_norm_var": 0.0003631432851155599, "learning_rate": 0.0001, "loss": 1.2762, "loss/crossentropy": 2.638411283493042, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.17076103389263153, "step": 12968 }, { "epoch": 0.19365532070569438, "grad_norm": 0.298828125, "grad_norm_var": 0.0003455956776936849, "learning_rate": 0.0001, "loss": 1.338, "loss/crossentropy": 2.541624665260315, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.18959145992994308, "step": 12969 }, { "epoch": 0.19367025287630937, "grad_norm": 0.26953125, "grad_norm_var": 0.0003803888956705729, "learning_rate": 0.0001, "loss": 1.351, "loss/crossentropy": 2.4929615259170532, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.19088680297136307, "step": 12970 }, { "epoch": 0.19368518504692434, "grad_norm": 0.287109375, "grad_norm_var": 0.00038172403971354165, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.6425654888153076, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19219227880239487, "step": 12971 }, { "epoch": 0.19370011721753932, "grad_norm": 0.298828125, "grad_norm_var": 0.0003796895345052083, "learning_rate": 0.0001, "loss": 1.4789, "loss/crossentropy": 2.6676487922668457, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20936934649944305, "step": 12972 }, { "epoch": 0.19371504938815431, "grad_norm": 0.2734375, "grad_norm_var": 0.0003843784332275391, "learning_rate": 0.0001, "loss": 1.3902, "loss/crossentropy": 2.465531587600708, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1948646754026413, "step": 12973 }, { "epoch": 0.1937299815587693, "grad_norm": 0.296875, "grad_norm_var": 0.00031800270080566405, "learning_rate": 0.0001, "loss": 1.3071, "loss/crossentropy": 2.655813455581665, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.17426686733961105, "step": 12974 }, { "epoch": 0.19374491372938427, "grad_norm": 0.291015625, "grad_norm_var": 0.0003002007802327474, "learning_rate": 0.0001, "loss": 1.448, "loss/crossentropy": 2.569991111755371, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19801422953605652, "step": 12975 }, { "epoch": 0.19375984589999926, "grad_norm": 0.296875, "grad_norm_var": 0.0003005345662434896, "learning_rate": 0.0001, "loss": 1.3886, "loss/crossentropy": 2.5714486837387085, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18937043100595474, "step": 12976 }, { "epoch": 0.19377477807061425, "grad_norm": 0.345703125, "grad_norm_var": 0.0004657586415608724, "learning_rate": 0.0001, "loss": 1.4951, "loss/crossentropy": 2.526023745536804, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2099320888519287, "step": 12977 }, { "epoch": 0.1937897102412292, "grad_norm": 0.37109375, "grad_norm_var": 0.0008167902628580729, "learning_rate": 0.0001, "loss": 1.6596, "loss/crossentropy": 2.3952548503875732, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.26511603593826294, "step": 12978 }, { "epoch": 0.1938046424118442, "grad_norm": 0.314453125, "grad_norm_var": 0.0007853190104166667, "learning_rate": 0.0001, "loss": 1.4167, "loss/crossentropy": 2.4326083660125732, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.1862337440252304, "step": 12979 }, { "epoch": 0.1938195745824592, "grad_norm": 0.341796875, "grad_norm_var": 0.0008646488189697265, "learning_rate": 0.0001, "loss": 1.4595, "loss/crossentropy": 2.656293272972107, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.18602384626865387, "step": 12980 }, { "epoch": 0.19383450675307415, "grad_norm": 0.3125, "grad_norm_var": 0.000853586196899414, "learning_rate": 0.0001, "loss": 1.4756, "loss/crossentropy": 2.497393846511841, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.22167667001485825, "step": 12981 }, { "epoch": 0.19384943892368914, "grad_norm": 0.287109375, "grad_norm_var": 0.0008389631907145183, "learning_rate": 0.0001, "loss": 1.3015, "loss/crossentropy": 2.637410044670105, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.16089213639497757, "step": 12982 }, { "epoch": 0.19386437109430413, "grad_norm": 0.310546875, "grad_norm_var": 0.0008356730143229167, "learning_rate": 0.0001, "loss": 1.5165, "loss/crossentropy": 2.6712170839309692, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.2391323372721672, "step": 12983 }, { "epoch": 0.19387930326491912, "grad_norm": 0.365234375, "grad_norm_var": 0.0009381612141927083, "learning_rate": 0.0001, "loss": 1.4812, "loss/crossentropy": 2.6288129091262817, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20778323709964752, "step": 12984 }, { "epoch": 0.19389423543553408, "grad_norm": 0.259765625, "grad_norm_var": 0.0010920206705729166, "learning_rate": 0.0001, "loss": 1.2889, "loss/crossentropy": 2.5946762561798096, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.17557203769683838, "step": 12985 }, { "epoch": 0.19390916760614907, "grad_norm": 0.296875, "grad_norm_var": 0.000999895731608073, "learning_rate": 0.0001, "loss": 1.3591, "loss/crossentropy": 2.6151013374328613, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19116738438606262, "step": 12986 }, { "epoch": 0.19392409977676406, "grad_norm": 0.3359375, "grad_norm_var": 0.0010042667388916015, "learning_rate": 0.0001, "loss": 1.5429, "loss/crossentropy": 2.6816290616989136, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.2265138030052185, "step": 12987 }, { "epoch": 0.19393903194737902, "grad_norm": 0.333984375, "grad_norm_var": 0.0010179996490478516, "learning_rate": 0.0001, "loss": 1.5525, "loss/crossentropy": 2.4882965087890625, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.22042158246040344, "step": 12988 }, { "epoch": 0.193953964117994, "grad_norm": 0.33203125, "grad_norm_var": 0.0009111881256103516, "learning_rate": 0.0001, "loss": 1.4354, "loss/crossentropy": 2.7152254581451416, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.2127232924103737, "step": 12989 }, { "epoch": 0.193968896288609, "grad_norm": 0.28125, "grad_norm_var": 0.0009709517161051432, "learning_rate": 0.0001, "loss": 1.4259, "loss/crossentropy": 2.571275234222412, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20711617171764374, "step": 12990 }, { "epoch": 0.19398382845922396, "grad_norm": 0.337890625, "grad_norm_var": 0.0009442488352457682, "learning_rate": 0.0001, "loss": 1.4665, "loss/crossentropy": 2.7545944452285767, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21647442877292633, "step": 12991 }, { "epoch": 0.19399876062983895, "grad_norm": 0.4453125, "grad_norm_var": 0.001859903335571289, "learning_rate": 0.0001, "loss": 1.5119, "loss/crossentropy": 2.4902329444885254, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.2072017937898636, "step": 12992 }, { "epoch": 0.19401369280045394, "grad_norm": 0.322265625, "grad_norm_var": 0.0018435001373291015, "learning_rate": 0.0001, "loss": 1.4888, "loss/crossentropy": 2.8621270656585693, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.20750559121370316, "step": 12993 }, { "epoch": 0.19402862497106893, "grad_norm": 0.28515625, "grad_norm_var": 0.0018113295237223307, "learning_rate": 0.0001, "loss": 1.3034, "loss/crossentropy": 2.6702208518981934, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.18228302896022797, "step": 12994 }, { "epoch": 0.1940435571416839, "grad_norm": 0.28515625, "grad_norm_var": 0.0018969217936197916, "learning_rate": 0.0001, "loss": 1.3868, "loss/crossentropy": 2.3990057706832886, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19541149586439133, "step": 12995 }, { "epoch": 0.19405848931229888, "grad_norm": 0.2890625, "grad_norm_var": 0.0019231001536051432, "learning_rate": 0.0001, "loss": 1.3271, "loss/crossentropy": 2.8262027502059937, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17472978681325912, "step": 12996 }, { "epoch": 0.19407342148291387, "grad_norm": 0.3125, "grad_norm_var": 0.0019231001536051432, "learning_rate": 0.0001, "loss": 1.4549, "loss/crossentropy": 2.776865005493164, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20879513770341873, "step": 12997 }, { "epoch": 0.19408835365352883, "grad_norm": 0.3515625, "grad_norm_var": 0.0019215265909830728, "learning_rate": 0.0001, "loss": 1.7816, "loss/crossentropy": 2.5450823307037354, "loss/fcd": 1.5, "loss/idx": 11.0, "loss/logits": 0.28164372593164444, "step": 12998 }, { "epoch": 0.19410328582414382, "grad_norm": 0.294921875, "grad_norm_var": 0.001959673563639323, "learning_rate": 0.0001, "loss": 1.3919, "loss/crossentropy": 2.6986435651779175, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18873322010040283, "step": 12999 }, { "epoch": 0.1941182179947588, "grad_norm": 0.3515625, "grad_norm_var": 0.0018899122873942058, "learning_rate": 0.0001, "loss": 1.7105, "loss/crossentropy": 2.8094496726989746, "loss/fcd": 1.44921875, "loss/idx": 11.0, "loss/logits": 0.26126258075237274, "step": 13000 }, { "epoch": 0.1941331501653738, "grad_norm": 0.376953125, "grad_norm_var": 0.0018117109934488933, "learning_rate": 0.0001, "loss": 1.4565, "loss/crossentropy": 2.692988157272339, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19864223152399063, "step": 13001 }, { "epoch": 0.19414808233598876, "grad_norm": 0.28125, "grad_norm_var": 0.0018897851308186848, "learning_rate": 0.0001, "loss": 1.3747, "loss/crossentropy": 2.665148138999939, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19505519419908524, "step": 13002 }, { "epoch": 0.19416301450660375, "grad_norm": 0.26953125, "grad_norm_var": 0.0020778497060139974, "learning_rate": 0.0001, "loss": 1.3867, "loss/crossentropy": 2.7461369037628174, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19917133450508118, "step": 13003 }, { "epoch": 0.19417794667721874, "grad_norm": 0.29296875, "grad_norm_var": 0.002116902669270833, "learning_rate": 0.0001, "loss": 1.4345, "loss/crossentropy": 2.631170630455017, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.21966839581727982, "step": 13004 }, { "epoch": 0.1941928788478337, "grad_norm": 0.3125, "grad_norm_var": 0.002107683817545573, "learning_rate": 0.0001, "loss": 1.4365, "loss/crossentropy": 2.6813405752182007, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20215269923210144, "step": 13005 }, { "epoch": 0.1942078110184487, "grad_norm": 0.298828125, "grad_norm_var": 0.0020405928293863933, "learning_rate": 0.0001, "loss": 1.2891, "loss/crossentropy": 2.479518175125122, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.16804690659046173, "step": 13006 }, { "epoch": 0.19422274318906368, "grad_norm": 0.287109375, "grad_norm_var": 0.002075306574503581, "learning_rate": 0.0001, "loss": 1.3184, "loss/crossentropy": 2.4274041652679443, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17779196053743362, "step": 13007 }, { "epoch": 0.19423767535967865, "grad_norm": 0.271484375, "grad_norm_var": 0.0009676615397135416, "learning_rate": 0.0001, "loss": 1.4101, "loss/crossentropy": 2.5555895566940308, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19523442536592484, "step": 13008 }, { "epoch": 0.19425260753029364, "grad_norm": 0.3125, "grad_norm_var": 0.0009513696034749349, "learning_rate": 0.0001, "loss": 1.4753, "loss/crossentropy": 2.4969085454940796, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.20970462262630463, "step": 13009 }, { "epoch": 0.19426753970090863, "grad_norm": 0.30859375, "grad_norm_var": 0.0009250481923421224, "learning_rate": 0.0001, "loss": 1.3863, "loss/crossentropy": 2.7397563457489014, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19491708278656006, "step": 13010 }, { "epoch": 0.19428247187152362, "grad_norm": 0.3203125, "grad_norm_var": 0.0009044488271077474, "learning_rate": 0.0001, "loss": 1.4589, "loss/crossentropy": 2.6017338037490845, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.2206452712416649, "step": 13011 }, { "epoch": 0.19429740404213858, "grad_norm": 0.28125, "grad_norm_var": 0.0009282271067301433, "learning_rate": 0.0001, "loss": 1.3526, "loss/crossentropy": 2.6591014862060547, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1846272498369217, "step": 13012 }, { "epoch": 0.19431233621275357, "grad_norm": 0.3203125, "grad_norm_var": 0.0009370009104410807, "learning_rate": 0.0001, "loss": 1.488, "loss/crossentropy": 2.5815078020095825, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21064499020576477, "step": 13013 }, { "epoch": 0.19432726838336856, "grad_norm": 0.283203125, "grad_norm_var": 0.0008340835571289063, "learning_rate": 0.0001, "loss": 1.4163, "loss/crossentropy": 2.5930824279785156, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.2014605551958084, "step": 13014 }, { "epoch": 0.19434220055398352, "grad_norm": 0.80078125, "grad_norm_var": 0.016218169530232748, "learning_rate": 0.0001, "loss": 1.7143, "loss/crossentropy": 2.3050631284713745, "loss/fcd": 1.47265625, "loss/idx": 11.0, "loss/logits": 0.2416067197918892, "step": 13015 }, { "epoch": 0.1943571327245985, "grad_norm": 0.314453125, "grad_norm_var": 0.016225115458170573, "learning_rate": 0.0001, "loss": 1.4827, "loss/crossentropy": 2.6755040884017944, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.20535773038864136, "step": 13016 }, { "epoch": 0.1943720648952135, "grad_norm": 0.55859375, "grad_norm_var": 0.019345585505167642, "learning_rate": 0.0001, "loss": 1.3727, "loss/crossentropy": 2.474701166152954, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18912433087825775, "step": 13017 }, { "epoch": 0.1943869970658285, "grad_norm": 0.28515625, "grad_norm_var": 0.019313542048136394, "learning_rate": 0.0001, "loss": 1.4613, "loss/crossentropy": 2.4568493366241455, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.19959115982055664, "step": 13018 }, { "epoch": 0.19440192923644345, "grad_norm": 0.333984375, "grad_norm_var": 0.018925921122233073, "learning_rate": 0.0001, "loss": 1.4582, "loss/crossentropy": 2.70623517036438, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21598868817090988, "step": 13019 }, { "epoch": 0.19441686140705844, "grad_norm": 0.296875, "grad_norm_var": 0.01889775594075521, "learning_rate": 0.0001, "loss": 1.4438, "loss/crossentropy": 2.5012906789779663, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.2211521565914154, "step": 13020 }, { "epoch": 0.19443179357767343, "grad_norm": 0.28125, "grad_norm_var": 0.01911137898763021, "learning_rate": 0.0001, "loss": 1.3852, "loss/crossentropy": 2.583197236061096, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19774195551872253, "step": 13021 }, { "epoch": 0.1944467257482884, "grad_norm": 0.263671875, "grad_norm_var": 0.01941521962483724, "learning_rate": 0.0001, "loss": 1.3381, "loss/crossentropy": 2.5960875749588013, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17009564489126205, "step": 13022 }, { "epoch": 0.19446165791890338, "grad_norm": 0.28515625, "grad_norm_var": 0.01943052609761556, "learning_rate": 0.0001, "loss": 1.3227, "loss/crossentropy": 2.6709253787994385, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17426908016204834, "step": 13023 }, { "epoch": 0.19447659008951837, "grad_norm": 0.2578125, "grad_norm_var": 0.01957594553629557, "learning_rate": 0.0001, "loss": 1.4116, "loss/crossentropy": 2.7112799882888794, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20456428825855255, "step": 13024 }, { "epoch": 0.19449152226013333, "grad_norm": 0.287109375, "grad_norm_var": 0.019722859064737957, "learning_rate": 0.0001, "loss": 1.5074, "loss/crossentropy": 2.3270524740219116, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21832095086574554, "step": 13025 }, { "epoch": 0.19450645443074832, "grad_norm": 0.26171875, "grad_norm_var": 0.02007152239481608, "learning_rate": 0.0001, "loss": 1.3649, "loss/crossentropy": 2.781067967414856, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1890798807144165, "step": 13026 }, { "epoch": 0.1945213866013633, "grad_norm": 0.302734375, "grad_norm_var": 0.02013575236002604, "learning_rate": 0.0001, "loss": 1.4248, "loss/crossentropy": 2.6864627599716187, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19820280373096466, "step": 13027 }, { "epoch": 0.1945363187719783, "grad_norm": 0.37890625, "grad_norm_var": 0.01998793284098307, "learning_rate": 0.0001, "loss": 1.4761, "loss/crossentropy": 2.6772409677505493, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21832669526338577, "step": 13028 }, { "epoch": 0.19455125094259326, "grad_norm": 0.423828125, "grad_norm_var": 0.020324055353800455, "learning_rate": 0.0001, "loss": 1.5724, "loss/crossentropy": 2.5233235359191895, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.23252413421869278, "step": 13029 }, { "epoch": 0.19456618311320825, "grad_norm": 0.34375, "grad_norm_var": 0.020006243387858072, "learning_rate": 0.0001, "loss": 1.4767, "loss/crossentropy": 2.8672170639038086, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.23447459936141968, "step": 13030 }, { "epoch": 0.19458111528382324, "grad_norm": 0.30078125, "grad_norm_var": 0.005894915262858073, "learning_rate": 0.0001, "loss": 1.407, "loss/crossentropy": 2.7454988956451416, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20385365188121796, "step": 13031 }, { "epoch": 0.1945960474544382, "grad_norm": 0.30859375, "grad_norm_var": 0.0059041182200113935, "learning_rate": 0.0001, "loss": 1.2938, "loss/crossentropy": 2.7109620571136475, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1649395152926445, "step": 13032 }, { "epoch": 0.1946109796250532, "grad_norm": 0.279296875, "grad_norm_var": 0.002010599772135417, "learning_rate": 0.0001, "loss": 1.3394, "loss/crossentropy": 2.879097580909729, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18708957731723785, "step": 13033 }, { "epoch": 0.19462591179566818, "grad_norm": 0.28125, "grad_norm_var": 0.002022234598795573, "learning_rate": 0.0001, "loss": 1.2809, "loss/crossentropy": 2.468254327774048, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.1637391597032547, "step": 13034 }, { "epoch": 0.19464084396628317, "grad_norm": 0.310546875, "grad_norm_var": 0.001967302958170573, "learning_rate": 0.0001, "loss": 1.5393, "loss/crossentropy": 2.221173644065857, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.23457194864749908, "step": 13035 }, { "epoch": 0.19465577613689813, "grad_norm": 0.32421875, "grad_norm_var": 0.00198822021484375, "learning_rate": 0.0001, "loss": 1.5062, "loss/crossentropy": 2.468099355697632, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22494655847549438, "step": 13036 }, { "epoch": 0.19467070830751312, "grad_norm": 0.291015625, "grad_norm_var": 0.0019623915354410808, "learning_rate": 0.0001, "loss": 1.3588, "loss/crossentropy": 2.8565794229507446, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.1947137415409088, "step": 13037 }, { "epoch": 0.1946856404781281, "grad_norm": 0.3671875, "grad_norm_var": 0.0020441055297851563, "learning_rate": 0.0001, "loss": 1.5117, "loss/crossentropy": 2.7509512901306152, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2187277227640152, "step": 13038 }, { "epoch": 0.19470057264874308, "grad_norm": 0.337890625, "grad_norm_var": 0.002023935317993164, "learning_rate": 0.0001, "loss": 1.5526, "loss/crossentropy": 2.547430634498596, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.23623086512088776, "step": 13039 }, { "epoch": 0.19471550481935807, "grad_norm": 0.296875, "grad_norm_var": 0.0018160343170166016, "learning_rate": 0.0001, "loss": 1.3527, "loss/crossentropy": 2.5950311422348022, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18473535776138306, "step": 13040 }, { "epoch": 0.19473043698997305, "grad_norm": 0.326171875, "grad_norm_var": 0.0017480055491129558, "learning_rate": 0.0001, "loss": 1.7229, "loss/crossentropy": 2.6424015760421753, "loss/fcd": 1.43359375, "loss/idx": 11.0, "loss/logits": 0.28928209841251373, "step": 13041 }, { "epoch": 0.19474536916058802, "grad_norm": 0.326171875, "grad_norm_var": 0.001498858133951823, "learning_rate": 0.0001, "loss": 1.4159, "loss/crossentropy": 2.6169700622558594, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20500122010707855, "step": 13042 }, { "epoch": 0.194760301331203, "grad_norm": 0.279296875, "grad_norm_var": 0.001602617899576823, "learning_rate": 0.0001, "loss": 1.3069, "loss/crossentropy": 2.654839873313904, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.1818580999970436, "step": 13043 }, { "epoch": 0.194775233501818, "grad_norm": 0.271484375, "grad_norm_var": 0.0015300591786702475, "learning_rate": 0.0001, "loss": 1.3731, "loss/crossentropy": 2.554171919822693, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19345618039369583, "step": 13044 }, { "epoch": 0.19479016567243299, "grad_norm": 0.2734375, "grad_norm_var": 0.0007969538370768229, "learning_rate": 0.0001, "loss": 1.4174, "loss/crossentropy": 2.6106879711151123, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.21430359035730362, "step": 13045 }, { "epoch": 0.19480509784304795, "grad_norm": 0.28125, "grad_norm_var": 0.0007379531860351562, "learning_rate": 0.0001, "loss": 1.3773, "loss/crossentropy": 2.5266255140304565, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18196634948253632, "step": 13046 }, { "epoch": 0.19482003001366294, "grad_norm": 0.302734375, "grad_norm_var": 0.0007374922434488933, "learning_rate": 0.0001, "loss": 1.369, "loss/crossentropy": 2.4267218112945557, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1658790111541748, "step": 13047 }, { "epoch": 0.19483496218427793, "grad_norm": 0.3046875, "grad_norm_var": 0.0007358392079671224, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.515745162963867, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.2131948471069336, "step": 13048 }, { "epoch": 0.1948498943548929, "grad_norm": 0.298828125, "grad_norm_var": 0.0006970564524332682, "learning_rate": 0.0001, "loss": 1.3008, "loss/crossentropy": 2.5432077646255493, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1680251657962799, "step": 13049 }, { "epoch": 0.19486482652550788, "grad_norm": 0.3125, "grad_norm_var": 0.0006609439849853515, "learning_rate": 0.0001, "loss": 1.4781, "loss/crossentropy": 2.6604756116867065, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.23590422421693802, "step": 13050 }, { "epoch": 0.19487975869612287, "grad_norm": 0.35546875, "grad_norm_var": 0.0008111953735351563, "learning_rate": 0.0001, "loss": 1.4907, "loss/crossentropy": 2.543608069419861, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21340475976467133, "step": 13051 }, { "epoch": 0.19489469086673783, "grad_norm": 0.322265625, "grad_norm_var": 0.0008075555165608724, "learning_rate": 0.0001, "loss": 1.4087, "loss/crossentropy": 1.9805002212524414, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18607201427221298, "step": 13052 }, { "epoch": 0.19490962303735282, "grad_norm": 0.291015625, "grad_norm_var": 0.0008075555165608724, "learning_rate": 0.0001, "loss": 1.3871, "loss/crossentropy": 2.5426989793777466, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1839493066072464, "step": 13053 }, { "epoch": 0.1949245552079678, "grad_norm": 0.283203125, "grad_norm_var": 0.0005990982055664063, "learning_rate": 0.0001, "loss": 1.3534, "loss/crossentropy": 2.615543246269226, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18932563066482544, "step": 13054 }, { "epoch": 0.1949394873785828, "grad_norm": 0.294921875, "grad_norm_var": 0.0005200703938802084, "learning_rate": 0.0001, "loss": 1.4257, "loss/crossentropy": 2.5671603679656982, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.1952325403690338, "step": 13055 }, { "epoch": 0.19495441954919776, "grad_norm": 0.271484375, "grad_norm_var": 0.0005752404530843098, "learning_rate": 0.0001, "loss": 1.405, "loss/crossentropy": 2.5536563396453857, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20185797661542892, "step": 13056 }, { "epoch": 0.19496935171981275, "grad_norm": 0.5234375, "grad_norm_var": 0.003704071044921875, "learning_rate": 0.0001, "loss": 1.4815, "loss/crossentropy": 2.6605318784713745, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.20412751287221909, "step": 13057 }, { "epoch": 0.19498428389042774, "grad_norm": 0.31640625, "grad_norm_var": 0.003691593805948893, "learning_rate": 0.0001, "loss": 1.412, "loss/crossentropy": 2.5816491842269897, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1932447925209999, "step": 13058 }, { "epoch": 0.1949992160610427, "grad_norm": 0.26171875, "grad_norm_var": 0.003786150614420573, "learning_rate": 0.0001, "loss": 1.2965, "loss/crossentropy": 2.7695289850234985, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.17149274051189423, "step": 13059 }, { "epoch": 0.1950141482316577, "grad_norm": 0.279296875, "grad_norm_var": 0.0037495295206705728, "learning_rate": 0.0001, "loss": 1.382, "loss/crossentropy": 2.4952540397644043, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1905459240078926, "step": 13060 }, { "epoch": 0.19502908040227268, "grad_norm": 0.259765625, "grad_norm_var": 0.0038293043772379557, "learning_rate": 0.0001, "loss": 1.2287, "loss/crossentropy": 2.5746930837631226, "loss/fcd": 1.07421875, "loss/idx": 11.0, "loss/logits": 0.1544754058122635, "step": 13061 }, { "epoch": 0.19504401257288767, "grad_norm": 0.310546875, "grad_norm_var": 0.003770891825358073, "learning_rate": 0.0001, "loss": 1.3388, "loss/crossentropy": 2.6329126358032227, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18641721457242966, "step": 13062 }, { "epoch": 0.19505894474350263, "grad_norm": 0.294921875, "grad_norm_var": 0.0037841161092122396, "learning_rate": 0.0001, "loss": 1.4915, "loss/crossentropy": 2.7415882349014282, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.22591343522071838, "step": 13063 }, { "epoch": 0.19507387691411762, "grad_norm": 0.3203125, "grad_norm_var": 0.0037856419881184896, "learning_rate": 0.0001, "loss": 1.3474, "loss/crossentropy": 2.81383216381073, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17947380989789963, "step": 13064 }, { "epoch": 0.1950888090847326, "grad_norm": 0.275390625, "grad_norm_var": 0.0038619359334309896, "learning_rate": 0.0001, "loss": 1.3543, "loss/crossentropy": 2.6352330446243286, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18633297830820084, "step": 13065 }, { "epoch": 0.19510374125534757, "grad_norm": 0.271484375, "grad_norm_var": 0.003957732518513998, "learning_rate": 0.0001, "loss": 1.3978, "loss/crossentropy": 2.656161308288574, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19858933985233307, "step": 13066 }, { "epoch": 0.19511867342596256, "grad_norm": 0.265625, "grad_norm_var": 0.003896315892537435, "learning_rate": 0.0001, "loss": 1.3424, "loss/crossentropy": 2.6343988180160522, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18228194117546082, "step": 13067 }, { "epoch": 0.19513360559657755, "grad_norm": 0.275390625, "grad_norm_var": 0.00391081174214681, "learning_rate": 0.0001, "loss": 1.3284, "loss/crossentropy": 2.4366923570632935, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1760367825627327, "step": 13068 }, { "epoch": 0.19514853776719251, "grad_norm": 0.25390625, "grad_norm_var": 0.004039764404296875, "learning_rate": 0.0001, "loss": 1.2718, "loss/crossentropy": 2.5629249811172485, "loss/fcd": 1.09375, "loss/idx": 11.0, "loss/logits": 0.17804181575775146, "step": 13069 }, { "epoch": 0.1951634699378075, "grad_norm": 0.294921875, "grad_norm_var": 0.004026222229003906, "learning_rate": 0.0001, "loss": 1.4597, "loss/crossentropy": 2.5731568336486816, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.21359217911958694, "step": 13070 }, { "epoch": 0.1951784021084225, "grad_norm": 0.279296875, "grad_norm_var": 0.00404809315999349, "learning_rate": 0.0001, "loss": 1.3118, "loss/crossentropy": 2.4426755905151367, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.16338827461004257, "step": 13071 }, { "epoch": 0.19519333427903748, "grad_norm": 0.32421875, "grad_norm_var": 0.004041655858357748, "learning_rate": 0.0001, "loss": 1.3452, "loss/crossentropy": 2.987370252609253, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18509234488010406, "step": 13072 }, { "epoch": 0.19520826644965245, "grad_norm": 0.28515625, "grad_norm_var": 0.0005046685536702473, "learning_rate": 0.0001, "loss": 1.3543, "loss/crossentropy": 2.5823715925216675, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1785305216908455, "step": 13073 }, { "epoch": 0.19522319862026744, "grad_norm": 0.365234375, "grad_norm_var": 0.0008547465006510417, "learning_rate": 0.0001, "loss": 1.7004, "loss/crossentropy": 2.4865082502365112, "loss/fcd": 1.421875, "loss/idx": 11.0, "loss/logits": 0.27850258350372314, "step": 13074 }, { "epoch": 0.19523813079088242, "grad_norm": 0.29296875, "grad_norm_var": 0.0008038838704427083, "learning_rate": 0.0001, "loss": 1.3543, "loss/crossentropy": 2.3504843711853027, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1746440976858139, "step": 13075 }, { "epoch": 0.1952530629614974, "grad_norm": 0.314453125, "grad_norm_var": 0.0008284886678059896, "learning_rate": 0.0001, "loss": 1.3864, "loss/crossentropy": 2.7102394104003906, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1950201392173767, "step": 13076 }, { "epoch": 0.19526799513211238, "grad_norm": 0.298828125, "grad_norm_var": 0.0007521947224934896, "learning_rate": 0.0001, "loss": 1.4196, "loss/crossentropy": 2.6953481435775757, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19307354092597961, "step": 13077 }, { "epoch": 0.19528292730272737, "grad_norm": 0.296875, "grad_norm_var": 0.0007358392079671224, "learning_rate": 0.0001, "loss": 1.5643, "loss/crossentropy": 2.3899132013320923, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.22440816462039948, "step": 13078 }, { "epoch": 0.19529785947334236, "grad_norm": 0.26953125, "grad_norm_var": 0.0007740656534830729, "learning_rate": 0.0001, "loss": 1.3439, "loss/crossentropy": 2.687647581100464, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1719863936305046, "step": 13079 }, { "epoch": 0.19531279164395732, "grad_norm": 0.310546875, "grad_norm_var": 0.0007441043853759766, "learning_rate": 0.0001, "loss": 1.4223, "loss/crossentropy": 2.72891104221344, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.2231261283159256, "step": 13080 }, { "epoch": 0.1953277238145723, "grad_norm": 0.326171875, "grad_norm_var": 0.0007920424143473307, "learning_rate": 0.0001, "loss": 1.4616, "loss/crossentropy": 2.6965943574905396, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20768684893846512, "step": 13081 }, { "epoch": 0.1953426559851873, "grad_norm": 0.3046875, "grad_norm_var": 0.0007555643717447917, "learning_rate": 0.0001, "loss": 1.5179, "loss/crossentropy": 2.7067174911499023, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.23275940120220184, "step": 13082 }, { "epoch": 0.19535758815580226, "grad_norm": 0.287109375, "grad_norm_var": 0.0006934960683186848, "learning_rate": 0.0001, "loss": 1.4145, "loss/crossentropy": 2.5630955696105957, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.21135380864143372, "step": 13083 }, { "epoch": 0.19537252032641725, "grad_norm": 0.30078125, "grad_norm_var": 0.0006548563639322916, "learning_rate": 0.0001, "loss": 1.4958, "loss/crossentropy": 2.7354859113693237, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22630048543214798, "step": 13084 }, { "epoch": 0.19538745249703224, "grad_norm": 0.333984375, "grad_norm_var": 0.0005603631337483724, "learning_rate": 0.0001, "loss": 1.5781, "loss/crossentropy": 2.5623910427093506, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.24998440593481064, "step": 13085 }, { "epoch": 0.1954023846676472, "grad_norm": 0.3984375, "grad_norm_var": 0.001086870829264323, "learning_rate": 0.0001, "loss": 1.4514, "loss/crossentropy": 2.678462862968445, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2053295075893402, "step": 13086 }, { "epoch": 0.1954173168382622, "grad_norm": 0.322265625, "grad_norm_var": 0.0010162353515625, "learning_rate": 0.0001, "loss": 1.5046, "loss/crossentropy": 2.6257896423339844, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22337304800748825, "step": 13087 }, { "epoch": 0.19543224900887718, "grad_norm": 0.275390625, "grad_norm_var": 0.00110166867574056, "learning_rate": 0.0001, "loss": 1.2865, "loss/crossentropy": 2.4712265729904175, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.16151226311922073, "step": 13088 }, { "epoch": 0.19544718117949217, "grad_norm": 0.275390625, "grad_norm_var": 0.001141802469889323, "learning_rate": 0.0001, "loss": 1.3124, "loss/crossentropy": 2.5957529544830322, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.17572157084941864, "step": 13089 }, { "epoch": 0.19546211335010713, "grad_norm": 0.306640625, "grad_norm_var": 0.0009310404459635417, "learning_rate": 0.0001, "loss": 1.4044, "loss/crossentropy": 2.6166096925735474, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.18955695629119873, "step": 13090 }, { "epoch": 0.19547704552072212, "grad_norm": 0.298828125, "grad_norm_var": 0.0009221235911051432, "learning_rate": 0.0001, "loss": 1.4225, "loss/crossentropy": 2.6193090677261353, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19983602315187454, "step": 13091 }, { "epoch": 0.1954919776913371, "grad_norm": 0.29296875, "grad_norm_var": 0.0009310404459635417, "learning_rate": 0.0001, "loss": 1.3189, "loss/crossentropy": 2.565080404281616, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17438587546348572, "step": 13092 }, { "epoch": 0.19550690986195207, "grad_norm": 0.302734375, "grad_norm_var": 0.0009281794230143229, "learning_rate": 0.0001, "loss": 1.4872, "loss/crossentropy": 2.4131895303726196, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.22544585913419724, "step": 13093 }, { "epoch": 0.19552184203256706, "grad_norm": 0.38671875, "grad_norm_var": 0.0013186136881510417, "learning_rate": 0.0001, "loss": 1.5112, "loss/crossentropy": 2.591077208518982, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22990170121192932, "step": 13094 }, { "epoch": 0.19553677420318205, "grad_norm": 0.263671875, "grad_norm_var": 0.0013539473215738933, "learning_rate": 0.0001, "loss": 1.393, "loss/crossentropy": 2.645379424095154, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.20548313111066818, "step": 13095 }, { "epoch": 0.19555170637379704, "grad_norm": 0.298828125, "grad_norm_var": 0.0013642470041910807, "learning_rate": 0.0001, "loss": 1.3805, "loss/crossentropy": 2.482788562774658, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19297119975090027, "step": 13096 }, { "epoch": 0.195566638544412, "grad_norm": 0.302734375, "grad_norm_var": 0.0013508955637613932, "learning_rate": 0.0001, "loss": 1.3744, "loss/crossentropy": 2.796888589859009, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18294428288936615, "step": 13097 }, { "epoch": 0.195581570715027, "grad_norm": 0.25390625, "grad_norm_var": 0.0015443007151285807, "learning_rate": 0.0001, "loss": 1.247, "loss/crossentropy": 2.5212807655334473, "loss/fcd": 1.083984375, "loss/idx": 11.0, "loss/logits": 0.16305457800626755, "step": 13098 }, { "epoch": 0.19559650288564198, "grad_norm": 0.400390625, "grad_norm_var": 0.00205686887105306, "learning_rate": 0.0001, "loss": 1.746, "loss/crossentropy": 2.8241103887557983, "loss/fcd": 1.48828125, "loss/idx": 11.0, "loss/logits": 0.2577321082353592, "step": 13099 }, { "epoch": 0.19561143505625694, "grad_norm": 0.291015625, "grad_norm_var": 0.002079200744628906, "learning_rate": 0.0001, "loss": 1.3531, "loss/crossentropy": 2.5925464630126953, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.19297394156455994, "step": 13100 }, { "epoch": 0.19562636722687193, "grad_norm": 0.287109375, "grad_norm_var": 0.0020837783813476562, "learning_rate": 0.0001, "loss": 1.3603, "loss/crossentropy": 2.434756636619568, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1766999214887619, "step": 13101 }, { "epoch": 0.19564129939748692, "grad_norm": 0.248046875, "grad_norm_var": 0.001720285415649414, "learning_rate": 0.0001, "loss": 1.2414, "loss/crossentropy": 2.562556505203247, "loss/fcd": 1.08203125, "loss/idx": 11.0, "loss/logits": 0.15941770374774933, "step": 13102 }, { "epoch": 0.19565623156810188, "grad_norm": 0.26171875, "grad_norm_var": 0.0017730077107747397, "learning_rate": 0.0001, "loss": 1.3306, "loss/crossentropy": 2.7010252475738525, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18609192222356796, "step": 13103 }, { "epoch": 0.19567116373871687, "grad_norm": 0.2890625, "grad_norm_var": 0.0017459710439046224, "learning_rate": 0.0001, "loss": 1.3898, "loss/crossentropy": 2.707445740699768, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18672245740890503, "step": 13104 }, { "epoch": 0.19568609590933186, "grad_norm": 0.296875, "grad_norm_var": 0.0017115275065104167, "learning_rate": 0.0001, "loss": 1.607, "loss/crossentropy": 2.409639358520508, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.24759627878665924, "step": 13105 }, { "epoch": 0.19570102807994685, "grad_norm": 0.349609375, "grad_norm_var": 0.0018716812133789062, "learning_rate": 0.0001, "loss": 1.5636, "loss/crossentropy": 2.7147048711776733, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.23548096418380737, "step": 13106 }, { "epoch": 0.19571596025056182, "grad_norm": 0.296875, "grad_norm_var": 0.0018726189931233723, "learning_rate": 0.0001, "loss": 1.4675, "loss/crossentropy": 2.64485764503479, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.23705071210861206, "step": 13107 }, { "epoch": 0.1957308924211768, "grad_norm": 0.2734375, "grad_norm_var": 0.0019183953603108723, "learning_rate": 0.0001, "loss": 1.3839, "loss/crossentropy": 2.359807848930359, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18471115827560425, "step": 13108 }, { "epoch": 0.1957458245917918, "grad_norm": 0.291015625, "grad_norm_var": 0.0019229729970296225, "learning_rate": 0.0001, "loss": 1.3684, "loss/crossentropy": 2.464175820350647, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18875881284475327, "step": 13109 }, { "epoch": 0.19576075676240676, "grad_norm": 0.3046875, "grad_norm_var": 0.0013889153798421225, "learning_rate": 0.0001, "loss": 1.3912, "loss/crossentropy": 2.8361796140670776, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.2154316008090973, "step": 13110 }, { "epoch": 0.19577568893302175, "grad_norm": 0.25, "grad_norm_var": 0.001456451416015625, "learning_rate": 0.0001, "loss": 1.2746, "loss/crossentropy": 2.511905312538147, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.176968514919281, "step": 13111 }, { "epoch": 0.19579062110363674, "grad_norm": 0.275390625, "grad_norm_var": 0.0014739990234375, "learning_rate": 0.0001, "loss": 1.3578, "loss/crossentropy": 2.5771450996398926, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1819954589009285, "step": 13112 }, { "epoch": 0.1958055532742517, "grad_norm": 0.294921875, "grad_norm_var": 0.0014666239420572916, "learning_rate": 0.0001, "loss": 1.4909, "loss/crossentropy": 2.6763157844543457, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.2213890329003334, "step": 13113 }, { "epoch": 0.1958204854448667, "grad_norm": 0.310546875, "grad_norm_var": 0.001383193333943685, "learning_rate": 0.0001, "loss": 1.4364, "loss/crossentropy": 2.7291805744171143, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.20204465836286545, "step": 13114 }, { "epoch": 0.19583541761548168, "grad_norm": 0.546875, "grad_norm_var": 0.004781850179036458, "learning_rate": 0.0001, "loss": 1.8223, "loss/crossentropy": 2.5902230739593506, "loss/fcd": 1.51953125, "loss/idx": 11.0, "loss/logits": 0.3027857840061188, "step": 13115 }, { "epoch": 0.19585034978609667, "grad_norm": 0.27734375, "grad_norm_var": 0.004817565282185872, "learning_rate": 0.0001, "loss": 1.2987, "loss/crossentropy": 2.5582363605499268, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1659296154975891, "step": 13116 }, { "epoch": 0.19586528195671163, "grad_norm": 0.267578125, "grad_norm_var": 0.004883686701456706, "learning_rate": 0.0001, "loss": 1.2806, "loss/crossentropy": 2.3709076642990112, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.15556833148002625, "step": 13117 }, { "epoch": 0.19588021412732662, "grad_norm": 0.37109375, "grad_norm_var": 0.004942766825358073, "learning_rate": 0.0001, "loss": 1.6057, "loss/crossentropy": 2.849392294883728, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.2268376350402832, "step": 13118 }, { "epoch": 0.1958951462979416, "grad_norm": 0.30078125, "grad_norm_var": 0.004787635803222656, "learning_rate": 0.0001, "loss": 1.4254, "loss/crossentropy": 2.688052535057068, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.2105259969830513, "step": 13119 }, { "epoch": 0.19591007846855657, "grad_norm": 0.28125, "grad_norm_var": 0.00481561024983724, "learning_rate": 0.0001, "loss": 1.3087, "loss/crossentropy": 2.642505645751953, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1798175424337387, "step": 13120 }, { "epoch": 0.19592501063917156, "grad_norm": 0.408203125, "grad_norm_var": 0.005369170506795248, "learning_rate": 0.0001, "loss": 1.773, "loss/crossentropy": 2.560065746307373, "loss/fcd": 1.5234375, "loss/idx": 11.0, "loss/logits": 0.24953554570674896, "step": 13121 }, { "epoch": 0.19593994280978655, "grad_norm": 0.314453125, "grad_norm_var": 0.005301650365193685, "learning_rate": 0.0001, "loss": 1.4322, "loss/crossentropy": 2.5453373193740845, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.2212313860654831, "step": 13122 }, { "epoch": 0.19595487498040154, "grad_norm": 0.291015625, "grad_norm_var": 0.00531915028889974, "learning_rate": 0.0001, "loss": 1.4061, "loss/crossentropy": 2.602495312690735, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19903133064508438, "step": 13123 }, { "epoch": 0.1959698071510165, "grad_norm": 0.29296875, "grad_norm_var": 0.005231730143229167, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.6820499897003174, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.1956019252538681, "step": 13124 }, { "epoch": 0.1959847393216315, "grad_norm": 0.359375, "grad_norm_var": 0.005283466974894206, "learning_rate": 0.0001, "loss": 1.4453, "loss/crossentropy": 2.73381507396698, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19534368813037872, "step": 13125 }, { "epoch": 0.19599967149224648, "grad_norm": 0.287109375, "grad_norm_var": 0.005342547098795573, "learning_rate": 0.0001, "loss": 1.5703, "loss/crossentropy": 2.2293648719787598, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.22656994313001633, "step": 13126 }, { "epoch": 0.19601460366286144, "grad_norm": 0.283203125, "grad_norm_var": 0.005099089940388998, "learning_rate": 0.0001, "loss": 1.3458, "loss/crossentropy": 2.5369762182235718, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18175002932548523, "step": 13127 }, { "epoch": 0.19602953583347643, "grad_norm": 0.4609375, "grad_norm_var": 0.00608208974202474, "learning_rate": 0.0001, "loss": 1.6434, "loss/crossentropy": 2.6869983673095703, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.24500460922718048, "step": 13128 }, { "epoch": 0.19604446800409142, "grad_norm": 0.26953125, "grad_norm_var": 0.006255451838175456, "learning_rate": 0.0001, "loss": 1.2551, "loss/crossentropy": 2.5867550373077393, "loss/fcd": 1.08984375, "loss/idx": 11.0, "loss/logits": 0.16523216664791107, "step": 13129 }, { "epoch": 0.19605940017470638, "grad_norm": 0.255859375, "grad_norm_var": 0.0066034793853759766, "learning_rate": 0.0001, "loss": 1.3098, "loss/crossentropy": 2.566839337348938, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.17305132746696472, "step": 13130 }, { "epoch": 0.19607433234532137, "grad_norm": 0.3203125, "grad_norm_var": 0.003236754735310872, "learning_rate": 0.0001, "loss": 1.3048, "loss/crossentropy": 2.6492520570755005, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1719408556818962, "step": 13131 }, { "epoch": 0.19608926451593636, "grad_norm": 0.28515625, "grad_norm_var": 0.00320127805074056, "learning_rate": 0.0001, "loss": 1.3912, "loss/crossentropy": 2.455108880996704, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1880672425031662, "step": 13132 }, { "epoch": 0.19610419668655135, "grad_norm": 0.326171875, "grad_norm_var": 0.00304106076558431, "learning_rate": 0.0001, "loss": 1.4907, "loss/crossentropy": 2.375242233276367, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.19768472760915756, "step": 13133 }, { "epoch": 0.1961191288571663, "grad_norm": 0.30859375, "grad_norm_var": 0.0028528690338134764, "learning_rate": 0.0001, "loss": 1.4205, "loss/crossentropy": 2.6366273164749146, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.1978730410337448, "step": 13134 }, { "epoch": 0.1961340610277813, "grad_norm": 0.28125, "grad_norm_var": 0.002914539972941081, "learning_rate": 0.0001, "loss": 1.384, "loss/crossentropy": 2.530272126197815, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.2003946229815483, "step": 13135 }, { "epoch": 0.1961489931983963, "grad_norm": 0.30078125, "grad_norm_var": 0.0028528690338134764, "learning_rate": 0.0001, "loss": 1.2873, "loss/crossentropy": 2.3425244092941284, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.16225451976060867, "step": 13136 }, { "epoch": 0.19616392536901125, "grad_norm": 0.29296875, "grad_norm_var": 0.0022555033365885416, "learning_rate": 0.0001, "loss": 1.5448, "loss/crossentropy": 2.413888931274414, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.23623421788215637, "step": 13137 }, { "epoch": 0.19617885753962624, "grad_norm": 0.46875, "grad_norm_var": 0.003874063491821289, "learning_rate": 0.0001, "loss": 1.5818, "loss/crossentropy": 2.571226716041565, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.22240641713142395, "step": 13138 }, { "epoch": 0.19619378971024123, "grad_norm": 0.283203125, "grad_norm_var": 0.0039057254791259764, "learning_rate": 0.0001, "loss": 1.2427, "loss/crossentropy": 2.4723037481307983, "loss/fcd": 1.08984375, "loss/idx": 11.0, "loss/logits": 0.15282511711120605, "step": 13139 }, { "epoch": 0.19620872188085622, "grad_norm": 0.28515625, "grad_norm_var": 0.003934844334920248, "learning_rate": 0.0001, "loss": 1.4334, "loss/crossentropy": 2.364640235900879, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20294228941202164, "step": 13140 }, { "epoch": 0.19622365405147119, "grad_norm": 0.291015625, "grad_norm_var": 0.0038386027018229166, "learning_rate": 0.0001, "loss": 1.3594, "loss/crossentropy": 2.692529082298279, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18361011147499084, "step": 13141 }, { "epoch": 0.19623858622208618, "grad_norm": 0.28515625, "grad_norm_var": 0.0038454532623291016, "learning_rate": 0.0001, "loss": 1.284, "loss/crossentropy": 2.483832359313965, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.15513406693935394, "step": 13142 }, { "epoch": 0.19625351839270117, "grad_norm": 0.2734375, "grad_norm_var": 0.0038894017537434897, "learning_rate": 0.0001, "loss": 1.4122, "loss/crossentropy": 2.5041663646698, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19347304105758667, "step": 13143 }, { "epoch": 0.19626845056331613, "grad_norm": 0.39453125, "grad_norm_var": 0.00284423828125, "learning_rate": 0.0001, "loss": 1.6522, "loss/crossentropy": 2.641385793685913, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.26937900483608246, "step": 13144 }, { "epoch": 0.19628338273393112, "grad_norm": 0.314453125, "grad_norm_var": 0.0027422428131103514, "learning_rate": 0.0001, "loss": 1.4181, "loss/crossentropy": 2.526704788208008, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19936396181583405, "step": 13145 }, { "epoch": 0.1962983149045461, "grad_norm": 0.349609375, "grad_norm_var": 0.0026094913482666016, "learning_rate": 0.0001, "loss": 1.5006, "loss/crossentropy": 2.42119300365448, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.20764151960611343, "step": 13146 }, { "epoch": 0.19631324707516107, "grad_norm": 0.33984375, "grad_norm_var": 0.0026438236236572266, "learning_rate": 0.0001, "loss": 1.426, "loss/crossentropy": 2.526168704032898, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1994517594575882, "step": 13147 }, { "epoch": 0.19632817924577606, "grad_norm": 0.3125, "grad_norm_var": 0.00257261594136556, "learning_rate": 0.0001, "loss": 1.378, "loss/crossentropy": 2.752177596092224, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19045475870370865, "step": 13148 }, { "epoch": 0.19634311141639105, "grad_norm": 0.625, "grad_norm_var": 0.008430989583333333, "learning_rate": 0.0001, "loss": 1.4402, "loss/crossentropy": 2.7439545392990112, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20190976560115814, "step": 13149 }, { "epoch": 0.19635804358700604, "grad_norm": 0.265625, "grad_norm_var": 0.008714230855305989, "learning_rate": 0.0001, "loss": 1.3209, "loss/crossentropy": 2.605916738510132, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.18030058592557907, "step": 13150 }, { "epoch": 0.196372975757621, "grad_norm": 0.3203125, "grad_norm_var": 0.008528582255045573, "learning_rate": 0.0001, "loss": 1.5117, "loss/crossentropy": 2.537140727043152, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.21480635553598404, "step": 13151 }, { "epoch": 0.196387907928236, "grad_norm": 0.328125, "grad_norm_var": 0.008440907796223958, "learning_rate": 0.0001, "loss": 1.4084, "loss/crossentropy": 2.5959140062332153, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1896708905696869, "step": 13152 }, { "epoch": 0.19640284009885098, "grad_norm": 0.3125, "grad_norm_var": 0.008343950907389323, "learning_rate": 0.0001, "loss": 1.5012, "loss/crossentropy": 2.535669207572937, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2121533825993538, "step": 13153 }, { "epoch": 0.19641777226946594, "grad_norm": 0.314453125, "grad_norm_var": 0.007195011774698893, "learning_rate": 0.0001, "loss": 1.461, "loss/crossentropy": 2.7049777507781982, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21881242841482162, "step": 13154 }, { "epoch": 0.19643270444008093, "grad_norm": 0.30859375, "grad_norm_var": 0.007073720296223958, "learning_rate": 0.0001, "loss": 1.4495, "loss/crossentropy": 2.7972599267959595, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.22295144200325012, "step": 13155 }, { "epoch": 0.19644763661069592, "grad_norm": 0.3203125, "grad_norm_var": 0.00692895253499349, "learning_rate": 0.0001, "loss": 1.351, "loss/crossentropy": 2.296112537384033, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.16743365675210953, "step": 13156 }, { "epoch": 0.1964625687813109, "grad_norm": 0.318359375, "grad_norm_var": 0.006816355387369791, "learning_rate": 0.0001, "loss": 1.3842, "loss/crossentropy": 2.561015248298645, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18501178175210953, "step": 13157 }, { "epoch": 0.19647750095192587, "grad_norm": 0.25, "grad_norm_var": 0.0071339289347330725, "learning_rate": 0.0001, "loss": 1.3007, "loss/crossentropy": 2.5988799333572388, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.17181964218616486, "step": 13158 }, { "epoch": 0.19649243312254086, "grad_norm": 0.2890625, "grad_norm_var": 0.007022539774576823, "learning_rate": 0.0001, "loss": 1.2778, "loss/crossentropy": 2.599443197250366, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.16065717488527298, "step": 13159 }, { "epoch": 0.19650736529315585, "grad_norm": 0.28515625, "grad_norm_var": 0.0069050470987955725, "learning_rate": 0.0001, "loss": 1.334, "loss/crossentropy": 2.6899322271347046, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.18560713529586792, "step": 13160 }, { "epoch": 0.1965222974637708, "grad_norm": 0.322265625, "grad_norm_var": 0.006894365946451823, "learning_rate": 0.0001, "loss": 1.4611, "loss/crossentropy": 2.8099868297576904, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20719966292381287, "step": 13161 }, { "epoch": 0.1965372296343858, "grad_norm": 0.26171875, "grad_norm_var": 0.007133976618448893, "learning_rate": 0.0001, "loss": 1.3233, "loss/crossentropy": 2.6828558444976807, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.18657036125659943, "step": 13162 }, { "epoch": 0.1965521618050008, "grad_norm": 0.71484375, "grad_norm_var": 0.016747013727823893, "learning_rate": 0.0001, "loss": 1.6178, "loss/crossentropy": 2.4198286533355713, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.2701701521873474, "step": 13163 }, { "epoch": 0.19656709397561575, "grad_norm": 0.330078125, "grad_norm_var": 0.016685930887858073, "learning_rate": 0.0001, "loss": 1.3954, "loss/crossentropy": 2.7764079570770264, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18838191032409668, "step": 13164 }, { "epoch": 0.19658202614623074, "grad_norm": 0.29296875, "grad_norm_var": 0.011308797200520833, "learning_rate": 0.0001, "loss": 1.3586, "loss/crossentropy": 2.6331472396850586, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18284057080745697, "step": 13165 }, { "epoch": 0.19659695831684573, "grad_norm": 0.31640625, "grad_norm_var": 0.01105340321858724, "learning_rate": 0.0001, "loss": 1.4266, "loss/crossentropy": 2.768945097923279, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.2274034395813942, "step": 13166 }, { "epoch": 0.19661189048746072, "grad_norm": 0.28125, "grad_norm_var": 0.011200904846191406, "learning_rate": 0.0001, "loss": 1.3661, "loss/crossentropy": 2.4149348735809326, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19028045237064362, "step": 13167 }, { "epoch": 0.19662682265807568, "grad_norm": 0.302734375, "grad_norm_var": 0.011240371068318685, "learning_rate": 0.0001, "loss": 1.3842, "loss/crossentropy": 2.618352174758911, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1849839836359024, "step": 13168 }, { "epoch": 0.19664175482869067, "grad_norm": 0.2734375, "grad_norm_var": 0.011407581965128581, "learning_rate": 0.0001, "loss": 1.3265, "loss/crossentropy": 2.275009870529175, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.17021000385284424, "step": 13169 }, { "epoch": 0.19665668699930566, "grad_norm": 0.275390625, "grad_norm_var": 0.011551904678344726, "learning_rate": 0.0001, "loss": 1.3122, "loss/crossentropy": 2.587579607963562, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.1715787947177887, "step": 13170 }, { "epoch": 0.19667161916992063, "grad_norm": 0.294921875, "grad_norm_var": 0.011586952209472656, "learning_rate": 0.0001, "loss": 1.328, "loss/crossentropy": 2.5751763582229614, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17566537857055664, "step": 13171 }, { "epoch": 0.19668655134053561, "grad_norm": 0.390625, "grad_norm_var": 0.011893653869628906, "learning_rate": 0.0001, "loss": 1.4076, "loss/crossentropy": 2.6657696962356567, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.18887481838464737, "step": 13172 }, { "epoch": 0.1967014835111506, "grad_norm": 0.27734375, "grad_norm_var": 0.012034845352172852, "learning_rate": 0.0001, "loss": 1.4288, "loss/crossentropy": 2.6180540323257446, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20224083214998245, "step": 13173 }, { "epoch": 0.19671641568176557, "grad_norm": 0.330078125, "grad_norm_var": 0.011662737528483073, "learning_rate": 0.0001, "loss": 1.502, "loss/crossentropy": 2.9029461145401, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.23634785413742065, "step": 13174 }, { "epoch": 0.19673134785238056, "grad_norm": 0.25390625, "grad_norm_var": 0.011919657389322916, "learning_rate": 0.0001, "loss": 1.3639, "loss/crossentropy": 2.5019052028656006, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18421922624111176, "step": 13175 }, { "epoch": 0.19674628002299555, "grad_norm": 0.296875, "grad_norm_var": 0.011865679423014324, "learning_rate": 0.0001, "loss": 1.3514, "loss/crossentropy": 2.6321024894714355, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.17559963464736938, "step": 13176 }, { "epoch": 0.19676121219361054, "grad_norm": 0.291015625, "grad_norm_var": 0.011941973368326824, "learning_rate": 0.0001, "loss": 1.3144, "loss/crossentropy": 2.770920753479004, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1698579639196396, "step": 13177 }, { "epoch": 0.1967761443642255, "grad_norm": 0.3671875, "grad_norm_var": 0.011761728922526042, "learning_rate": 0.0001, "loss": 1.8499, "loss/crossentropy": 2.629474639892578, "loss/fcd": 1.54296875, "loss/idx": 11.0, "loss/logits": 0.30689041316509247, "step": 13178 }, { "epoch": 0.1967910765348405, "grad_norm": 0.29296875, "grad_norm_var": 0.0012697855631510417, "learning_rate": 0.0001, "loss": 1.3666, "loss/crossentropy": 2.691903829574585, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.19473834335803986, "step": 13179 }, { "epoch": 0.19680600870545548, "grad_norm": 0.306640625, "grad_norm_var": 0.0012232462565104167, "learning_rate": 0.0001, "loss": 1.337, "loss/crossentropy": 2.709852695465088, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1689949631690979, "step": 13180 }, { "epoch": 0.19682094087607044, "grad_norm": 0.314453125, "grad_norm_var": 0.0012241204579671223, "learning_rate": 0.0001, "loss": 1.4367, "loss/crossentropy": 2.7557992935180664, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2023526132106781, "step": 13181 }, { "epoch": 0.19683587304668543, "grad_norm": 0.302734375, "grad_norm_var": 0.0012133280436197917, "learning_rate": 0.0001, "loss": 1.4739, "loss/crossentropy": 2.4741861820220947, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21604853868484497, "step": 13182 }, { "epoch": 0.19685080521730042, "grad_norm": 0.408203125, "grad_norm_var": 0.0018487135569254557, "learning_rate": 0.0001, "loss": 2.1456, "loss/crossentropy": 2.469208240509033, "loss/fcd": 1.69140625, "loss/idx": 11.0, "loss/logits": 0.45415031909942627, "step": 13183 }, { "epoch": 0.1968657373879154, "grad_norm": 0.345703125, "grad_norm_var": 0.0019158522288004558, "learning_rate": 0.0001, "loss": 1.3373, "loss/crossentropy": 2.8031846284866333, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17718569189310074, "step": 13184 }, { "epoch": 0.19688066955853037, "grad_norm": 0.546875, "grad_norm_var": 0.0051157474517822266, "learning_rate": 0.0001, "loss": 1.5237, "loss/crossentropy": 2.703389525413513, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.226799875497818, "step": 13185 }, { "epoch": 0.19689560172914536, "grad_norm": 0.298828125, "grad_norm_var": 0.004976511001586914, "learning_rate": 0.0001, "loss": 1.3731, "loss/crossentropy": 2.6349653005599976, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19734369963407516, "step": 13186 }, { "epoch": 0.19691053389976035, "grad_norm": 0.298828125, "grad_norm_var": 0.004957946141560873, "learning_rate": 0.0001, "loss": 1.4449, "loss/crossentropy": 2.527416944503784, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.21837803721427917, "step": 13187 }, { "epoch": 0.1969254660703753, "grad_norm": 0.298828125, "grad_norm_var": 0.004774920145670573, "learning_rate": 0.0001, "loss": 1.3651, "loss/crossentropy": 2.6279326677322388, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18541865795850754, "step": 13188 }, { "epoch": 0.1969403982409903, "grad_norm": 0.26953125, "grad_norm_var": 0.004830360412597656, "learning_rate": 0.0001, "loss": 1.2984, "loss/crossentropy": 2.3676384687423706, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1656162366271019, "step": 13189 }, { "epoch": 0.1969553304116053, "grad_norm": 0.296875, "grad_norm_var": 0.0048830509185791016, "learning_rate": 0.0001, "loss": 1.3669, "loss/crossentropy": 2.347358465194702, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.17939550429582596, "step": 13190 }, { "epoch": 0.19697026258222025, "grad_norm": 0.263671875, "grad_norm_var": 0.004797299702962239, "learning_rate": 0.0001, "loss": 1.3315, "loss/crossentropy": 2.53109347820282, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.18308822065591812, "step": 13191 }, { "epoch": 0.19698519475283524, "grad_norm": 0.271484375, "grad_norm_var": 0.0049326419830322266, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.3423057794570923, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.22143225371837616, "step": 13192 }, { "epoch": 0.19700012692345023, "grad_norm": 0.26953125, "grad_norm_var": 0.005054155985514323, "learning_rate": 0.0001, "loss": 1.2839, "loss/crossentropy": 2.803149461746216, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.17451299726963043, "step": 13193 }, { "epoch": 0.19701505909406522, "grad_norm": 0.294921875, "grad_norm_var": 0.00494535764058431, "learning_rate": 0.0001, "loss": 1.4886, "loss/crossentropy": 2.765531301498413, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.2152080535888672, "step": 13194 }, { "epoch": 0.19702999126468018, "grad_norm": 0.3203125, "grad_norm_var": 0.00490263303120931, "learning_rate": 0.0001, "loss": 1.4304, "loss/crossentropy": 2.7167216539382935, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.22341838479042053, "step": 13195 }, { "epoch": 0.19704492343529517, "grad_norm": 0.30078125, "grad_norm_var": 0.004914601643880208, "learning_rate": 0.0001, "loss": 1.4158, "loss/crossentropy": 2.660623073577881, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20098990201950073, "step": 13196 }, { "epoch": 0.19705985560591016, "grad_norm": 0.326171875, "grad_norm_var": 0.0049163182576497395, "learning_rate": 0.0001, "loss": 1.5311, "loss/crossentropy": 2.7784451246261597, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.2341885268688202, "step": 13197 }, { "epoch": 0.19707478777652512, "grad_norm": 0.28515625, "grad_norm_var": 0.004975112279256185, "learning_rate": 0.0001, "loss": 1.3491, "loss/crossentropy": 2.481260061264038, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17719075828790665, "step": 13198 }, { "epoch": 0.1970897199471401, "grad_norm": 0.291015625, "grad_norm_var": 0.004431517918904623, "learning_rate": 0.0001, "loss": 1.574, "loss/crossentropy": 2.562163829803467, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.24975963681936264, "step": 13199 }, { "epoch": 0.1971046521177551, "grad_norm": 0.287109375, "grad_norm_var": 0.00437620480855306, "learning_rate": 0.0001, "loss": 1.4196, "loss/crossentropy": 2.626181483268738, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20871196687221527, "step": 13200 }, { "epoch": 0.1971195842883701, "grad_norm": 0.271484375, "grad_norm_var": 0.00032647450764973957, "learning_rate": 0.0001, "loss": 1.3205, "loss/crossentropy": 2.264981269836426, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17986155301332474, "step": 13201 }, { "epoch": 0.19713451645898505, "grad_norm": 0.447265625, "grad_norm_var": 0.001872698465983073, "learning_rate": 0.0001, "loss": 1.8668, "loss/crossentropy": 2.3471877574920654, "loss/fcd": 1.62890625, "loss/idx": 11.0, "loss/logits": 0.2378838211297989, "step": 13202 }, { "epoch": 0.19714944862960004, "grad_norm": 0.28515625, "grad_norm_var": 0.0018857161204020182, "learning_rate": 0.0001, "loss": 1.3597, "loss/crossentropy": 2.6635314226150513, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18388742208480835, "step": 13203 }, { "epoch": 0.19716438080021503, "grad_norm": 0.3828125, "grad_norm_var": 0.0023279190063476562, "learning_rate": 0.0001, "loss": 1.3904, "loss/crossentropy": 2.5560413599014282, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.187274731695652, "step": 13204 }, { "epoch": 0.19717931297083, "grad_norm": 0.30859375, "grad_norm_var": 0.0022439956665039062, "learning_rate": 0.0001, "loss": 1.3839, "loss/crossentropy": 2.460902690887451, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.18081119656562805, "step": 13205 }, { "epoch": 0.19719424514144498, "grad_norm": 0.27734375, "grad_norm_var": 0.002292633056640625, "learning_rate": 0.0001, "loss": 1.3342, "loss/crossentropy": 2.472321391105652, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18188952654600143, "step": 13206 }, { "epoch": 0.19720917731205997, "grad_norm": 0.306640625, "grad_norm_var": 0.0021702448527018228, "learning_rate": 0.0001, "loss": 1.3166, "loss/crossentropy": 2.64757764339447, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1798725426197052, "step": 13207 }, { "epoch": 0.19722410948267494, "grad_norm": 0.353515625, "grad_norm_var": 0.0021929423014322917, "learning_rate": 0.0001, "loss": 1.5565, "loss/crossentropy": 2.7299180030822754, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.20888099074363708, "step": 13208 }, { "epoch": 0.19723904165328993, "grad_norm": 0.26953125, "grad_norm_var": 0.0021929423014322917, "learning_rate": 0.0001, "loss": 1.3359, "loss/crossentropy": 2.6009708642959595, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1835147589445114, "step": 13209 }, { "epoch": 0.19725397382390492, "grad_norm": 0.2890625, "grad_norm_var": 0.0022092024485270184, "learning_rate": 0.0001, "loss": 1.4058, "loss/crossentropy": 2.52272367477417, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19094198942184448, "step": 13210 }, { "epoch": 0.1972689059945199, "grad_norm": 0.326171875, "grad_norm_var": 0.0022173563639322916, "learning_rate": 0.0001, "loss": 1.5935, "loss/crossentropy": 2.618382453918457, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.2380671501159668, "step": 13211 }, { "epoch": 0.19728383816513487, "grad_norm": 0.392578125, "grad_norm_var": 0.002594614028930664, "learning_rate": 0.0001, "loss": 1.7472, "loss/crossentropy": 2.323714256286621, "loss/fcd": 1.50390625, "loss/idx": 11.0, "loss/logits": 0.24326357245445251, "step": 13212 }, { "epoch": 0.19729877033574986, "grad_norm": 0.298828125, "grad_norm_var": 0.0026141961415608725, "learning_rate": 0.0001, "loss": 1.3841, "loss/crossentropy": 2.5144048929214478, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1965983882546425, "step": 13213 }, { "epoch": 0.19731370250636485, "grad_norm": 0.283203125, "grad_norm_var": 0.0026227315266927082, "learning_rate": 0.0001, "loss": 1.3533, "loss/crossentropy": 2.6588646173477173, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.19310042262077332, "step": 13214 }, { "epoch": 0.1973286346769798, "grad_norm": 0.3125, "grad_norm_var": 0.0025774478912353516, "learning_rate": 0.0001, "loss": 1.4232, "loss/crossentropy": 2.677197217941284, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19661054015159607, "step": 13215 }, { "epoch": 0.1973435668475948, "grad_norm": 0.3359375, "grad_norm_var": 0.0025238037109375, "learning_rate": 0.0001, "loss": 1.4443, "loss/crossentropy": 2.566201329231262, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20207378268241882, "step": 13216 }, { "epoch": 0.1973584990182098, "grad_norm": 0.36328125, "grad_norm_var": 0.0024408817291259764, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.4074233770370483, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19209985435009003, "step": 13217 }, { "epoch": 0.19737343118882478, "grad_norm": 0.30078125, "grad_norm_var": 0.001433563232421875, "learning_rate": 0.0001, "loss": 1.3478, "loss/crossentropy": 2.6558181047439575, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.19158418476581573, "step": 13218 }, { "epoch": 0.19738836335943974, "grad_norm": 0.2734375, "grad_norm_var": 0.0014932632446289062, "learning_rate": 0.0001, "loss": 1.373, "loss/crossentropy": 2.6290690898895264, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19334295392036438, "step": 13219 }, { "epoch": 0.19740329553005473, "grad_norm": 0.40234375, "grad_norm_var": 0.0016881306966145834, "learning_rate": 0.0001, "loss": 1.784, "loss/crossentropy": 2.404885768890381, "loss/fcd": 1.5078125, "loss/idx": 11.0, "loss/logits": 0.2762131243944168, "step": 13220 }, { "epoch": 0.19741822770066972, "grad_norm": 0.294921875, "grad_norm_var": 0.0017176151275634765, "learning_rate": 0.0001, "loss": 1.4699, "loss/crossentropy": 2.442149519920349, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20038705319166183, "step": 13221 }, { "epoch": 0.19743315987128468, "grad_norm": 0.296875, "grad_norm_var": 0.0016368707021077475, "learning_rate": 0.0001, "loss": 1.4059, "loss/crossentropy": 2.4883267879486084, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.2105884552001953, "step": 13222 }, { "epoch": 0.19744809204189967, "grad_norm": 0.333984375, "grad_norm_var": 0.001639540990193685, "learning_rate": 0.0001, "loss": 1.5815, "loss/crossentropy": 2.560744285583496, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.22209949791431427, "step": 13223 }, { "epoch": 0.19746302421251466, "grad_norm": 0.30078125, "grad_norm_var": 0.0015807469685872396, "learning_rate": 0.0001, "loss": 1.3689, "loss/crossentropy": 2.7280306816101074, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18531429767608643, "step": 13224 }, { "epoch": 0.19747795638312962, "grad_norm": 0.275390625, "grad_norm_var": 0.0015456994374593099, "learning_rate": 0.0001, "loss": 1.3266, "loss/crossentropy": 2.429328203201294, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1781943142414093, "step": 13225 }, { "epoch": 0.1974928885537446, "grad_norm": 0.283203125, "grad_norm_var": 0.0015700658162434895, "learning_rate": 0.0001, "loss": 1.4031, "loss/crossentropy": 2.6779803037643433, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.21173611283302307, "step": 13226 }, { "epoch": 0.1975078207243596, "grad_norm": 0.30078125, "grad_norm_var": 0.0015797773996988932, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.5700948238372803, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20784726738929749, "step": 13227 }, { "epoch": 0.1975227528949746, "grad_norm": 0.2890625, "grad_norm_var": 0.001186370849609375, "learning_rate": 0.0001, "loss": 1.3512, "loss/crossentropy": 2.533159017562866, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.19103019684553146, "step": 13228 }, { "epoch": 0.19753768506558955, "grad_norm": 0.2734375, "grad_norm_var": 0.0012613773345947266, "learning_rate": 0.0001, "loss": 1.4436, "loss/crossentropy": 2.5497519969940186, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2091936320066452, "step": 13229 }, { "epoch": 0.19755261723620454, "grad_norm": 0.318359375, "grad_norm_var": 0.0012247562408447266, "learning_rate": 0.0001, "loss": 1.4464, "loss/crossentropy": 2.5034717321395874, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20419898629188538, "step": 13230 }, { "epoch": 0.19756754940681953, "grad_norm": 0.31640625, "grad_norm_var": 0.0012271722157796224, "learning_rate": 0.0001, "loss": 1.3949, "loss/crossentropy": 2.602152109146118, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1956464722752571, "step": 13231 }, { "epoch": 0.1975824815774345, "grad_norm": 0.345703125, "grad_norm_var": 0.0012669881184895833, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.803423523902893, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19693507999181747, "step": 13232 }, { "epoch": 0.19759741374804948, "grad_norm": 0.296875, "grad_norm_var": 0.0010756810506184896, "learning_rate": 0.0001, "loss": 1.3495, "loss/crossentropy": 2.4413565397262573, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.17370712757110596, "step": 13233 }, { "epoch": 0.19761234591866447, "grad_norm": 0.306640625, "grad_norm_var": 0.0010734399159749348, "learning_rate": 0.0001, "loss": 1.4091, "loss/crossentropy": 2.5050348043441772, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18639858812093735, "step": 13234 }, { "epoch": 0.19762727808927943, "grad_norm": 0.32421875, "grad_norm_var": 0.0010089715321858724, "learning_rate": 0.0001, "loss": 1.5249, "loss/crossentropy": 2.438870072364807, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.21625643223524094, "step": 13235 }, { "epoch": 0.19764221025989442, "grad_norm": 0.27734375, "grad_norm_var": 0.0004454135894775391, "learning_rate": 0.0001, "loss": 1.3474, "loss/crossentropy": 2.6454282999038696, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1872101128101349, "step": 13236 }, { "epoch": 0.1976571424305094, "grad_norm": 0.30859375, "grad_norm_var": 0.0004439671834309896, "learning_rate": 0.0001, "loss": 1.4647, "loss/crossentropy": 2.3971952199935913, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20294788479804993, "step": 13237 }, { "epoch": 0.1976720746011244, "grad_norm": 0.291015625, "grad_norm_var": 0.0004508813222249349, "learning_rate": 0.0001, "loss": 1.3484, "loss/crossentropy": 2.6019245386123657, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18047110736370087, "step": 13238 }, { "epoch": 0.19768700677173937, "grad_norm": 0.291015625, "grad_norm_var": 0.0003865400950113932, "learning_rate": 0.0001, "loss": 1.4202, "loss/crossentropy": 2.668920159339905, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1936056762933731, "step": 13239 }, { "epoch": 0.19770193894235435, "grad_norm": 0.357421875, "grad_norm_var": 0.0005935033162434896, "learning_rate": 0.0001, "loss": 1.6624, "loss/crossentropy": 2.249101161956787, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.2522211968898773, "step": 13240 }, { "epoch": 0.19771687111296934, "grad_norm": 0.3046875, "grad_norm_var": 0.0005374749501546224, "learning_rate": 0.0001, "loss": 1.4685, "loss/crossentropy": 2.5523310899734497, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2067813128232956, "step": 13241 }, { "epoch": 0.1977318032835843, "grad_norm": 0.294921875, "grad_norm_var": 0.0005115350087483724, "learning_rate": 0.0001, "loss": 1.4587, "loss/crossentropy": 2.6022346019744873, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21655553579330444, "step": 13242 }, { "epoch": 0.1977467354541993, "grad_norm": 0.314453125, "grad_norm_var": 0.0005136489868164063, "learning_rate": 0.0001, "loss": 1.4507, "loss/crossentropy": 2.652217149734497, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21636473387479782, "step": 13243 }, { "epoch": 0.19776166762481429, "grad_norm": 0.298828125, "grad_norm_var": 0.0004964033762613932, "learning_rate": 0.0001, "loss": 1.396, "loss/crossentropy": 2.606067419052124, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1929040104150772, "step": 13244 }, { "epoch": 0.19777659979542928, "grad_norm": 0.2421875, "grad_norm_var": 0.0006993452707926433, "learning_rate": 0.0001, "loss": 1.3969, "loss/crossentropy": 2.483457922935486, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.2055337280035019, "step": 13245 }, { "epoch": 0.19779153196604424, "grad_norm": 0.2490234375, "grad_norm_var": 0.0008813182512919109, "learning_rate": 0.0001, "loss": 1.2466, "loss/crossentropy": 2.514565348625183, "loss/fcd": 1.08984375, "loss/idx": 11.0, "loss/logits": 0.15680017322301865, "step": 13246 }, { "epoch": 0.19780646413665923, "grad_norm": 0.310546875, "grad_norm_var": 0.0008715907732645671, "learning_rate": 0.0001, "loss": 1.4432, "loss/crossentropy": 2.9649477005004883, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.19708993285894394, "step": 13247 }, { "epoch": 0.19782139630727422, "grad_norm": 0.3515625, "grad_norm_var": 0.0009087840716044108, "learning_rate": 0.0001, "loss": 1.6367, "loss/crossentropy": 2.406627058982849, "loss/fcd": 1.40234375, "loss/idx": 11.0, "loss/logits": 0.23437880724668503, "step": 13248 }, { "epoch": 0.19783632847788918, "grad_norm": 0.322265625, "grad_norm_var": 0.0009344061215718587, "learning_rate": 0.0001, "loss": 1.4296, "loss/crossentropy": 2.6359310150146484, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.17959249019622803, "step": 13249 }, { "epoch": 0.19785126064850417, "grad_norm": 0.294921875, "grad_norm_var": 0.0009369810422261555, "learning_rate": 0.0001, "loss": 1.4546, "loss/crossentropy": 2.795200824737549, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2124321460723877, "step": 13250 }, { "epoch": 0.19786619281911916, "grad_norm": 0.30859375, "grad_norm_var": 0.0009060819943745931, "learning_rate": 0.0001, "loss": 1.3576, "loss/crossentropy": 2.7177789211273193, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18572969734668732, "step": 13251 }, { "epoch": 0.19788112498973412, "grad_norm": 0.27734375, "grad_norm_var": 0.0009060819943745931, "learning_rate": 0.0001, "loss": 1.4112, "loss/crossentropy": 2.6191678047180176, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.21587973833084106, "step": 13252 }, { "epoch": 0.1978960571603491, "grad_norm": 0.27734375, "grad_norm_var": 0.0009358366330464681, "learning_rate": 0.0001, "loss": 1.399, "loss/crossentropy": 2.4347022771835327, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1880674585700035, "step": 13253 }, { "epoch": 0.1979109893309641, "grad_norm": 0.314453125, "grad_norm_var": 0.0009448011716206868, "learning_rate": 0.0001, "loss": 1.459, "loss/crossentropy": 2.823094964027405, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.21288764476776123, "step": 13254 }, { "epoch": 0.1979259215015791, "grad_norm": 0.296875, "grad_norm_var": 0.0009394605954488118, "learning_rate": 0.0001, "loss": 1.3894, "loss/crossentropy": 2.3309096097946167, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19409368932247162, "step": 13255 }, { "epoch": 0.19794085367219405, "grad_norm": 0.6328125, "grad_norm_var": 0.007752509911855062, "learning_rate": 0.0001, "loss": 1.6279, "loss/crossentropy": 2.619774580001831, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.23335997015237808, "step": 13256 }, { "epoch": 0.19795578584280904, "grad_norm": 0.291015625, "grad_norm_var": 0.0077887813250223795, "learning_rate": 0.0001, "loss": 1.4911, "loss/crossentropy": 2.5995218753814697, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.2371544912457466, "step": 13257 }, { "epoch": 0.19797071801342403, "grad_norm": 0.37109375, "grad_norm_var": 0.007923916975657145, "learning_rate": 0.0001, "loss": 1.6438, "loss/crossentropy": 2.536817789077759, "loss/fcd": 1.39453125, "loss/idx": 11.0, "loss/logits": 0.24924490600824356, "step": 13258 }, { "epoch": 0.197985650184039, "grad_norm": 0.267578125, "grad_norm_var": 0.008108929793039957, "learning_rate": 0.0001, "loss": 1.5365, "loss/crossentropy": 2.336771607398987, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.22786425054073334, "step": 13259 }, { "epoch": 0.19800058235465398, "grad_norm": 0.27734375, "grad_norm_var": 0.008196000258127849, "learning_rate": 0.0001, "loss": 1.3384, "loss/crossentropy": 2.6552069187164307, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18608994036912918, "step": 13260 }, { "epoch": 0.19801551452526897, "grad_norm": 0.283203125, "grad_norm_var": 0.007887581984202066, "learning_rate": 0.0001, "loss": 1.4067, "loss/crossentropy": 2.7702406644821167, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19961939007043839, "step": 13261 }, { "epoch": 0.19803044669588396, "grad_norm": 0.34375, "grad_norm_var": 0.007547235488891602, "learning_rate": 0.0001, "loss": 1.5959, "loss/crossentropy": 2.631529927253723, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.24044479429721832, "step": 13262 }, { "epoch": 0.19804537886649892, "grad_norm": 0.3125, "grad_norm_var": 0.007543373107910156, "learning_rate": 0.0001, "loss": 1.4767, "loss/crossentropy": 2.68074893951416, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21497581154108047, "step": 13263 }, { "epoch": 0.1980603110371139, "grad_norm": 0.26953125, "grad_norm_var": 0.00768890380859375, "learning_rate": 0.0001, "loss": 1.4217, "loss/crossentropy": 2.6021971702575684, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19908349215984344, "step": 13264 }, { "epoch": 0.1980752432077289, "grad_norm": 0.279296875, "grad_norm_var": 0.00779870351155599, "learning_rate": 0.0001, "loss": 1.433, "loss/crossentropy": 2.510871171951294, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2063944935798645, "step": 13265 }, { "epoch": 0.19809017537834386, "grad_norm": 0.306640625, "grad_norm_var": 0.0077702840169270836, "learning_rate": 0.0001, "loss": 1.4784, "loss/crossentropy": 2.6384105682373047, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20882797241210938, "step": 13266 }, { "epoch": 0.19810510754895885, "grad_norm": 0.275390625, "grad_norm_var": 0.007886743545532227, "learning_rate": 0.0001, "loss": 1.4203, "loss/crossentropy": 2.726689338684082, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.2172176018357277, "step": 13267 }, { "epoch": 0.19812003971957384, "grad_norm": 0.3515625, "grad_norm_var": 0.007836008071899414, "learning_rate": 0.0001, "loss": 1.581, "loss/crossentropy": 2.4158791303634644, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.2293892651796341, "step": 13268 }, { "epoch": 0.1981349718901888, "grad_norm": 0.298828125, "grad_norm_var": 0.007737223307291667, "learning_rate": 0.0001, "loss": 1.3289, "loss/crossentropy": 2.6307986974716187, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17659807205200195, "step": 13269 }, { "epoch": 0.1981499040608038, "grad_norm": 0.314453125, "grad_norm_var": 0.007737223307291667, "learning_rate": 0.0001, "loss": 1.5207, "loss/crossentropy": 2.659094214439392, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.22770410776138306, "step": 13270 }, { "epoch": 0.19816483623141878, "grad_norm": 0.322265625, "grad_norm_var": 0.007688252131144205, "learning_rate": 0.0001, "loss": 1.551, "loss/crossentropy": 2.6533981561660767, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.22675758600234985, "step": 13271 }, { "epoch": 0.19817976840203377, "grad_norm": 0.31640625, "grad_norm_var": 0.0009522596995035807, "learning_rate": 0.0001, "loss": 1.5269, "loss/crossentropy": 2.3546783924102783, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.22613851726055145, "step": 13272 }, { "epoch": 0.19819470057264874, "grad_norm": 0.271484375, "grad_norm_var": 0.0010126590728759765, "learning_rate": 0.0001, "loss": 1.3323, "loss/crossentropy": 2.7399812936782837, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17215919494628906, "step": 13273 }, { "epoch": 0.19820963274326373, "grad_norm": 0.390625, "grad_norm_var": 0.0012116591135660807, "learning_rate": 0.0001, "loss": 1.7211, "loss/crossentropy": 2.3842915296554565, "loss/fcd": 1.4765625, "loss/idx": 11.0, "loss/logits": 0.24452316761016846, "step": 13274 }, { "epoch": 0.19822456491387871, "grad_norm": 0.330078125, "grad_norm_var": 0.001143503189086914, "learning_rate": 0.0001, "loss": 1.4522, "loss/crossentropy": 2.857653260231018, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21393553912639618, "step": 13275 }, { "epoch": 0.19823949708449368, "grad_norm": 0.291015625, "grad_norm_var": 0.0010975519816080728, "learning_rate": 0.0001, "loss": 1.4043, "loss/crossentropy": 2.5494165420532227, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20116853713989258, "step": 13276 }, { "epoch": 0.19825442925510867, "grad_norm": 0.30078125, "grad_norm_var": 0.0010544935862223308, "learning_rate": 0.0001, "loss": 1.3531, "loss/crossentropy": 2.2843655347824097, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1733878254890442, "step": 13277 }, { "epoch": 0.19826936142572366, "grad_norm": 0.251953125, "grad_norm_var": 0.0011792500813802083, "learning_rate": 0.0001, "loss": 1.2806, "loss/crossentropy": 2.4768881797790527, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.16341090202331543, "step": 13278 }, { "epoch": 0.19828429359633865, "grad_norm": 0.3359375, "grad_norm_var": 0.0012364705403645833, "learning_rate": 0.0001, "loss": 1.6964, "loss/crossentropy": 2.2241514921188354, "loss/fcd": 1.44921875, "loss/idx": 11.0, "loss/logits": 0.24720606207847595, "step": 13279 }, { "epoch": 0.1982992257669536, "grad_norm": 0.3046875, "grad_norm_var": 0.0011397679646809897, "learning_rate": 0.0001, "loss": 1.2626, "loss/crossentropy": 2.710598587989807, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.1649702861905098, "step": 13280 }, { "epoch": 0.1983141579375686, "grad_norm": 0.306640625, "grad_norm_var": 0.00107879638671875, "learning_rate": 0.0001, "loss": 1.4991, "loss/crossentropy": 2.680593729019165, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.22174976766109467, "step": 13281 }, { "epoch": 0.1983290901081836, "grad_norm": 0.375, "grad_norm_var": 0.0013352553049723306, "learning_rate": 0.0001, "loss": 1.4967, "loss/crossentropy": 2.6101720333099365, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.22326326370239258, "step": 13282 }, { "epoch": 0.19834402227879855, "grad_norm": 0.31640625, "grad_norm_var": 0.0012247721354166667, "learning_rate": 0.0001, "loss": 1.5981, "loss/crossentropy": 2.505405902862549, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.23479685187339783, "step": 13283 }, { "epoch": 0.19835895444941354, "grad_norm": 0.328125, "grad_norm_var": 0.0011522928873697917, "learning_rate": 0.0001, "loss": 1.5956, "loss/crossentropy": 2.6675329208374023, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.24016311764717102, "step": 13284 }, { "epoch": 0.19837388662002853, "grad_norm": 0.265625, "grad_norm_var": 0.001296854019165039, "learning_rate": 0.0001, "loss": 1.2796, "loss/crossentropy": 2.5206531286239624, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.1663072556257248, "step": 13285 }, { "epoch": 0.1983888187906435, "grad_norm": 0.30078125, "grad_norm_var": 0.0013074239095052084, "learning_rate": 0.0001, "loss": 1.4745, "loss/crossentropy": 2.577363610267639, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.20892395824193954, "step": 13286 }, { "epoch": 0.19840375096125848, "grad_norm": 0.376953125, "grad_norm_var": 0.0015619913736979166, "learning_rate": 0.0001, "loss": 2.0518, "loss/crossentropy": 2.9215065240859985, "loss/fcd": 1.61328125, "loss/idx": 11.0, "loss/logits": 0.4385107755661011, "step": 13287 }, { "epoch": 0.19841868313187347, "grad_norm": 0.287109375, "grad_norm_var": 0.0016156355539957681, "learning_rate": 0.0001, "loss": 1.5034, "loss/crossentropy": 2.6504433155059814, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.23774731159210205, "step": 13288 }, { "epoch": 0.19843361530248846, "grad_norm": 0.267578125, "grad_norm_var": 0.0016390323638916016, "learning_rate": 0.0001, "loss": 1.3351, "loss/crossentropy": 2.5548086166381836, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18279144912958145, "step": 13289 }, { "epoch": 0.19844854747310342, "grad_norm": 0.30078125, "grad_norm_var": 0.0012295881907145182, "learning_rate": 0.0001, "loss": 1.5233, "loss/crossentropy": 2.587503671646118, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2303345799446106, "step": 13290 }, { "epoch": 0.1984634796437184, "grad_norm": 0.302734375, "grad_norm_var": 0.0011984348297119141, "learning_rate": 0.0001, "loss": 1.4096, "loss/crossentropy": 2.5971295833587646, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20648466050624847, "step": 13291 }, { "epoch": 0.1984784118143334, "grad_norm": 0.353515625, "grad_norm_var": 0.0013093153635660808, "learning_rate": 0.0001, "loss": 1.5971, "loss/crossentropy": 2.4109020233154297, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.2533482015132904, "step": 13292 }, { "epoch": 0.19849334398494836, "grad_norm": 0.34765625, "grad_norm_var": 0.0013833204905192057, "learning_rate": 0.0001, "loss": 1.5721, "loss/crossentropy": 1.9797710180282593, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.22053905576467514, "step": 13293 }, { "epoch": 0.19850827615556335, "grad_norm": 0.333984375, "grad_norm_var": 0.0011269728342692058, "learning_rate": 0.0001, "loss": 1.4195, "loss/crossentropy": 2.7416810989379883, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19294843077659607, "step": 13294 }, { "epoch": 0.19852320832617834, "grad_norm": 0.30078125, "grad_norm_var": 0.0011246840159098307, "learning_rate": 0.0001, "loss": 1.5966, "loss/crossentropy": 2.6174182891845703, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.2567080706357956, "step": 13295 }, { "epoch": 0.1985381404967933, "grad_norm": 0.291015625, "grad_norm_var": 0.0011583964029947917, "learning_rate": 0.0001, "loss": 1.3291, "loss/crossentropy": 2.6360981464385986, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17673557996749878, "step": 13296 }, { "epoch": 0.1985530726674083, "grad_norm": 0.294921875, "grad_norm_var": 0.0011814753214518228, "learning_rate": 0.0001, "loss": 1.4635, "loss/crossentropy": 2.575412154197693, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.21348806470632553, "step": 13297 }, { "epoch": 0.19856800483802328, "grad_norm": 0.298828125, "grad_norm_var": 0.0009366194407145182, "learning_rate": 0.0001, "loss": 1.5176, "loss/crossentropy": 2.5351345539093018, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.23242277652025223, "step": 13298 }, { "epoch": 0.19858293700863827, "grad_norm": 0.287109375, "grad_norm_var": 0.0009668986002604167, "learning_rate": 0.0001, "loss": 1.4043, "loss/crossentropy": 2.3708380460739136, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.2051134556531906, "step": 13299 }, { "epoch": 0.19859786917925323, "grad_norm": 0.271484375, "grad_norm_var": 0.001019906997680664, "learning_rate": 0.0001, "loss": 1.3277, "loss/crossentropy": 2.6626967191696167, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.179251529276371, "step": 13300 }, { "epoch": 0.19861280134986822, "grad_norm": 0.337890625, "grad_norm_var": 0.0009663899739583334, "learning_rate": 0.0001, "loss": 1.5412, "loss/crossentropy": 2.3676849603652954, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.20521771907806396, "step": 13301 }, { "epoch": 0.1986277335204832, "grad_norm": 0.392578125, "grad_norm_var": 0.0013854821523030599, "learning_rate": 0.0001, "loss": 1.9754, "loss/crossentropy": 2.549842596054077, "loss/fcd": 1.63671875, "loss/idx": 11.0, "loss/logits": 0.3386466056108475, "step": 13302 }, { "epoch": 0.19864266569109817, "grad_norm": 0.2890625, "grad_norm_var": 0.0011458714803059896, "learning_rate": 0.0001, "loss": 1.434, "loss/crossentropy": 2.885810375213623, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2152639999985695, "step": 13303 }, { "epoch": 0.19865759786171316, "grad_norm": 0.302734375, "grad_norm_var": 0.0011138280232747396, "learning_rate": 0.0001, "loss": 1.3425, "loss/crossentropy": 2.704000234603882, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.17843157052993774, "step": 13304 }, { "epoch": 0.19867253003232815, "grad_norm": 0.298828125, "grad_norm_var": 0.0009948094685872397, "learning_rate": 0.0001, "loss": 1.449, "loss/crossentropy": 2.4537694454193115, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21067368984222412, "step": 13305 }, { "epoch": 0.19868746220294314, "grad_norm": 0.2734375, "grad_norm_var": 0.0010851542154947916, "learning_rate": 0.0001, "loss": 1.4856, "loss/crossentropy": 2.3330289125442505, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.2317129224538803, "step": 13306 }, { "epoch": 0.1987023943735581, "grad_norm": 0.3359375, "grad_norm_var": 0.0011173089345296223, "learning_rate": 0.0001, "loss": 1.5339, "loss/crossentropy": 2.8983876705169678, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2487102895975113, "step": 13307 }, { "epoch": 0.1987173265441731, "grad_norm": 0.4453125, "grad_norm_var": 0.002138519287109375, "learning_rate": 0.0001, "loss": 1.3351, "loss/crossentropy": 2.764520049095154, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.17882110178470612, "step": 13308 }, { "epoch": 0.19873225871478808, "grad_norm": 0.32421875, "grad_norm_var": 0.00208282470703125, "learning_rate": 0.0001, "loss": 1.3548, "loss/crossentropy": 2.813220739364624, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.17903825640678406, "step": 13309 }, { "epoch": 0.19874719088540305, "grad_norm": 0.328125, "grad_norm_var": 0.002072000503540039, "learning_rate": 0.0001, "loss": 1.649, "loss/crossentropy": 2.489962100982666, "loss/fcd": 1.390625, "loss/idx": 11.0, "loss/logits": 0.25841011106967926, "step": 13310 }, { "epoch": 0.19876212305601804, "grad_norm": 0.306640625, "grad_norm_var": 0.00206146240234375, "learning_rate": 0.0001, "loss": 1.6691, "loss/crossentropy": 2.3703598976135254, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.313599094748497, "step": 13311 }, { "epoch": 0.19877705522663303, "grad_norm": 0.326171875, "grad_norm_var": 0.0020151138305664062, "learning_rate": 0.0001, "loss": 1.267, "loss/crossentropy": 2.7307082414627075, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.15759174525737762, "step": 13312 }, { "epoch": 0.198791987397248, "grad_norm": 0.3046875, "grad_norm_var": 0.001988967259724935, "learning_rate": 0.0001, "loss": 1.5324, "loss/crossentropy": 2.6365480422973633, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.23550205677747726, "step": 13313 }, { "epoch": 0.19880691956786298, "grad_norm": 0.3671875, "grad_norm_var": 0.0020863215128580728, "learning_rate": 0.0001, "loss": 1.3792, "loss/crossentropy": 2.5700124502182007, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.17994524538516998, "step": 13314 }, { "epoch": 0.19882185173847797, "grad_norm": 0.271484375, "grad_norm_var": 0.002179400126139323, "learning_rate": 0.0001, "loss": 1.3756, "loss/crossentropy": 2.737910509109497, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19985555857419968, "step": 13315 }, { "epoch": 0.19883678390909296, "grad_norm": 0.55078125, "grad_norm_var": 0.0051182905832926435, "learning_rate": 0.0001, "loss": 1.8302, "loss/crossentropy": 2.738781690597534, "loss/fcd": 1.60546875, "loss/idx": 11.0, "loss/logits": 0.2247532159090042, "step": 13316 }, { "epoch": 0.19885171607970792, "grad_norm": 0.306640625, "grad_norm_var": 0.0051920413970947266, "learning_rate": 0.0001, "loss": 1.3621, "loss/crossentropy": 2.274332284927368, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.17462465912103653, "step": 13317 }, { "epoch": 0.1988666482503229, "grad_norm": 0.345703125, "grad_norm_var": 0.004994440078735352, "learning_rate": 0.0001, "loss": 1.5299, "loss/crossentropy": 2.494638442993164, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2369263917207718, "step": 13318 }, { "epoch": 0.1988815804209379, "grad_norm": 0.35546875, "grad_norm_var": 0.00485393206278483, "learning_rate": 0.0001, "loss": 1.5169, "loss/crossentropy": 2.6451282501220703, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.22787168622016907, "step": 13319 }, { "epoch": 0.19889651259155286, "grad_norm": 0.3515625, "grad_norm_var": 0.004758961995442708, "learning_rate": 0.0001, "loss": 1.6213, "loss/crossentropy": 2.6274231672286987, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.25805409252643585, "step": 13320 }, { "epoch": 0.19891144476216785, "grad_norm": 0.263671875, "grad_norm_var": 0.00504449208577474, "learning_rate": 0.0001, "loss": 1.2274, "loss/crossentropy": 2.5125240087509155, "loss/fcd": 1.078125, "loss/idx": 11.0, "loss/logits": 0.14924510568380356, "step": 13321 }, { "epoch": 0.19892637693278284, "grad_norm": 0.3203125, "grad_norm_var": 0.004759152730305989, "learning_rate": 0.0001, "loss": 1.3995, "loss/crossentropy": 2.7787747383117676, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19638922810554504, "step": 13322 }, { "epoch": 0.19894130910339783, "grad_norm": 0.34765625, "grad_norm_var": 0.004755147298177083, "learning_rate": 0.0001, "loss": 1.6546, "loss/crossentropy": 2.573300838470459, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.2561345472931862, "step": 13323 }, { "epoch": 0.1989562412740128, "grad_norm": 0.3984375, "grad_norm_var": 0.004263814290364583, "learning_rate": 0.0001, "loss": 2.0248, "loss/crossentropy": 2.4621936082839966, "loss/fcd": 1.64453125, "loss/idx": 11.0, "loss/logits": 0.38029052317142487, "step": 13324 }, { "epoch": 0.19897117344462778, "grad_norm": 0.267578125, "grad_norm_var": 0.004597075780232747, "learning_rate": 0.0001, "loss": 1.3315, "loss/crossentropy": 2.5583871603012085, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.17528581619262695, "step": 13325 }, { "epoch": 0.19898610561524277, "grad_norm": 0.32421875, "grad_norm_var": 0.0046033064524332685, "learning_rate": 0.0001, "loss": 1.521, "loss/crossentropy": 2.6913524866104126, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2319706305861473, "step": 13326 }, { "epoch": 0.19900103778585773, "grad_norm": 0.322265625, "grad_norm_var": 0.00455320676167806, "learning_rate": 0.0001, "loss": 1.4306, "loss/crossentropy": 2.30329430103302, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19234810024499893, "step": 13327 }, { "epoch": 0.19901596995647272, "grad_norm": 0.31640625, "grad_norm_var": 0.004575856526692708, "learning_rate": 0.0001, "loss": 1.3653, "loss/crossentropy": 2.596874952316284, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1895267590880394, "step": 13328 }, { "epoch": 0.1990309021270877, "grad_norm": 0.341796875, "grad_norm_var": 0.00449522336324056, "learning_rate": 0.0001, "loss": 1.3947, "loss/crossentropy": 2.675814986228943, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19162072241306305, "step": 13329 }, { "epoch": 0.19904583429770267, "grad_norm": 0.27734375, "grad_norm_var": 0.004682397842407227, "learning_rate": 0.0001, "loss": 1.262, "loss/crossentropy": 2.458872437477112, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.15657660365104675, "step": 13330 }, { "epoch": 0.19906076646831766, "grad_norm": 0.287109375, "grad_norm_var": 0.0045651594797770185, "learning_rate": 0.0001, "loss": 1.448, "loss/crossentropy": 2.5730026960372925, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21362043917179108, "step": 13331 }, { "epoch": 0.19907569863893265, "grad_norm": 0.3046875, "grad_norm_var": 0.0013047377268473306, "learning_rate": 0.0001, "loss": 1.3543, "loss/crossentropy": 2.4921809434890747, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18632403016090393, "step": 13332 }, { "epoch": 0.19909063080954764, "grad_norm": 0.287109375, "grad_norm_var": 0.0013651371002197266, "learning_rate": 0.0001, "loss": 1.3664, "loss/crossentropy": 2.6981037855148315, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19059638679027557, "step": 13333 }, { "epoch": 0.1991055629801626, "grad_norm": 0.32421875, "grad_norm_var": 0.0013188044230143228, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 2.4583674669265747, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21151554584503174, "step": 13334 }, { "epoch": 0.1991204951507776, "grad_norm": 0.353515625, "grad_norm_var": 0.0013093153635660808, "learning_rate": 0.0001, "loss": 1.5393, "loss/crossentropy": 2.430943727493286, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.23850193619728088, "step": 13335 }, { "epoch": 0.19913542732139258, "grad_norm": 0.30078125, "grad_norm_var": 0.0012431939442952473, "learning_rate": 0.0001, "loss": 1.5334, "loss/crossentropy": 2.5844842195510864, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.24038460105657578, "step": 13336 }, { "epoch": 0.19915035949200754, "grad_norm": 0.2890625, "grad_norm_var": 0.0011103312174479167, "learning_rate": 0.0001, "loss": 1.3617, "loss/crossentropy": 2.719287872314453, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18979760259389877, "step": 13337 }, { "epoch": 0.19916529166262253, "grad_norm": 0.302734375, "grad_norm_var": 0.0011204878489176433, "learning_rate": 0.0001, "loss": 1.3867, "loss/crossentropy": 2.3826332092285156, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.195296511054039, "step": 13338 }, { "epoch": 0.19918022383323752, "grad_norm": 0.28515625, "grad_norm_var": 0.0010950565338134766, "learning_rate": 0.0001, "loss": 1.3941, "loss/crossentropy": 2.677780866622925, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.20273350179195404, "step": 13339 }, { "epoch": 0.1991951560038525, "grad_norm": 0.3046875, "grad_norm_var": 0.0005564212799072266, "learning_rate": 0.0001, "loss": 1.4556, "loss/crossentropy": 2.6079611778259277, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.22124642878770828, "step": 13340 }, { "epoch": 0.19921008817446748, "grad_norm": 0.275390625, "grad_norm_var": 0.0005206902821858724, "learning_rate": 0.0001, "loss": 1.4836, "loss/crossentropy": 2.5560734272003174, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.2179476022720337, "step": 13341 }, { "epoch": 0.19922502034508247, "grad_norm": 0.328125, "grad_norm_var": 0.0005311171213785807, "learning_rate": 0.0001, "loss": 1.4994, "loss/crossentropy": 2.440353751182556, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.22592265903949738, "step": 13342 }, { "epoch": 0.19923995251569745, "grad_norm": 0.3203125, "grad_norm_var": 0.000527191162109375, "learning_rate": 0.0001, "loss": 1.5651, "loss/crossentropy": 2.41437029838562, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.2409181445837021, "step": 13343 }, { "epoch": 0.19925488468631242, "grad_norm": 0.2734375, "grad_norm_var": 0.0005838394165039063, "learning_rate": 0.0001, "loss": 1.3243, "loss/crossentropy": 2.7510424852371216, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1876109093427658, "step": 13344 }, { "epoch": 0.1992698168569274, "grad_norm": 0.296875, "grad_norm_var": 0.0004803816477457682, "learning_rate": 0.0001, "loss": 1.4447, "loss/crossentropy": 2.4687047004699707, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2142430543899536, "step": 13345 }, { "epoch": 0.1992847490275424, "grad_norm": 0.28515625, "grad_norm_var": 0.00045990943908691406, "learning_rate": 0.0001, "loss": 1.2521, "loss/crossentropy": 2.6209434270858765, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.15445218980312347, "step": 13346 }, { "epoch": 0.19929968119815736, "grad_norm": 0.27734375, "grad_norm_var": 0.00048414866129557293, "learning_rate": 0.0001, "loss": 1.2855, "loss/crossentropy": 2.8746895790100098, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.1800057366490364, "step": 13347 }, { "epoch": 0.19931461336877235, "grad_norm": 0.29296875, "grad_norm_var": 0.0004862467447916667, "learning_rate": 0.0001, "loss": 1.4665, "loss/crossentropy": 2.4281539916992188, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.2087155431509018, "step": 13348 }, { "epoch": 0.19932954553938734, "grad_norm": 0.322265625, "grad_norm_var": 0.0005039850870768229, "learning_rate": 0.0001, "loss": 1.5066, "loss/crossentropy": 2.7912888526916504, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.22146548330783844, "step": 13349 }, { "epoch": 0.19934447771000233, "grad_norm": 0.333984375, "grad_norm_var": 0.0005388736724853515, "learning_rate": 0.0001, "loss": 1.3497, "loss/crossentropy": 2.4724783897399902, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17004980146884918, "step": 13350 }, { "epoch": 0.1993594098806173, "grad_norm": 0.275390625, "grad_norm_var": 0.0003901004791259766, "learning_rate": 0.0001, "loss": 1.4846, "loss/crossentropy": 2.3971747159957886, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.23069938272237778, "step": 13351 }, { "epoch": 0.19937434205123228, "grad_norm": 0.28515625, "grad_norm_var": 0.0003990014394124349, "learning_rate": 0.0001, "loss": 1.3444, "loss/crossentropy": 2.294656753540039, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17644548416137695, "step": 13352 }, { "epoch": 0.19938927422184727, "grad_norm": 0.255859375, "grad_norm_var": 0.0005019505818684896, "learning_rate": 0.0001, "loss": 1.2959, "loss/crossentropy": 2.451422095298767, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.17872552573680878, "step": 13353 }, { "epoch": 0.19940420639246223, "grad_norm": 0.30859375, "grad_norm_var": 0.0005103905995686848, "learning_rate": 0.0001, "loss": 1.4091, "loss/crossentropy": 2.7009295225143433, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1903492584824562, "step": 13354 }, { "epoch": 0.19941913856307722, "grad_norm": 0.2734375, "grad_norm_var": 0.0005344231923421224, "learning_rate": 0.0001, "loss": 1.4158, "loss/crossentropy": 2.51647412776947, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1970798373222351, "step": 13355 }, { "epoch": 0.1994340707336922, "grad_norm": 0.3125, "grad_norm_var": 0.0005490461985270183, "learning_rate": 0.0001, "loss": 1.4725, "loss/crossentropy": 2.535524010658264, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21077340841293335, "step": 13356 }, { "epoch": 0.19944900290430717, "grad_norm": 0.322265625, "grad_norm_var": 0.0005650679270426432, "learning_rate": 0.0001, "loss": 1.3151, "loss/crossentropy": 2.877856731414795, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.17837173491716385, "step": 13357 }, { "epoch": 0.19946393507492216, "grad_norm": 0.271484375, "grad_norm_var": 0.000536028544108073, "learning_rate": 0.0001, "loss": 1.4154, "loss/crossentropy": 2.7089685201644897, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20841452479362488, "step": 13358 }, { "epoch": 0.19947886724553715, "grad_norm": 0.322265625, "grad_norm_var": 0.0005430698394775391, "learning_rate": 0.0001, "loss": 1.2686, "loss/crossentropy": 2.5147584676742554, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.16312694549560547, "step": 13359 }, { "epoch": 0.19949379941615214, "grad_norm": 0.296875, "grad_norm_var": 0.0005121707916259765, "learning_rate": 0.0001, "loss": 1.3735, "loss/crossentropy": 2.496360182762146, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18205617368221283, "step": 13360 }, { "epoch": 0.1995087315867671, "grad_norm": 0.27734375, "grad_norm_var": 0.000533151626586914, "learning_rate": 0.0001, "loss": 1.3592, "loss/crossentropy": 2.3719611167907715, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17947089672088623, "step": 13361 }, { "epoch": 0.1995236637573821, "grad_norm": 0.47265625, "grad_norm_var": 0.002495431900024414, "learning_rate": 0.0001, "loss": 1.5898, "loss/crossentropy": 2.3214714527130127, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.24600719660520554, "step": 13362 }, { "epoch": 0.19953859592799708, "grad_norm": 0.435546875, "grad_norm_var": 0.0034494400024414062, "learning_rate": 0.0001, "loss": 1.5571, "loss/crossentropy": 2.5151153802871704, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.24463516473770142, "step": 13363 }, { "epoch": 0.19955352809861204, "grad_norm": 0.287109375, "grad_norm_var": 0.003469705581665039, "learning_rate": 0.0001, "loss": 1.4041, "loss/crossentropy": 2.2901793718338013, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.20879990607500076, "step": 13364 }, { "epoch": 0.19956846026922703, "grad_norm": 0.283203125, "grad_norm_var": 0.003531376520792643, "learning_rate": 0.0001, "loss": 1.3236, "loss/crossentropy": 2.584269642829895, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.18294858932495117, "step": 13365 }, { "epoch": 0.19958339243984202, "grad_norm": 0.310546875, "grad_norm_var": 0.0035012404123942057, "learning_rate": 0.0001, "loss": 1.3993, "loss/crossentropy": 2.424952983856201, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.1883842572569847, "step": 13366 }, { "epoch": 0.199598324610457, "grad_norm": 0.29296875, "grad_norm_var": 0.0034350077311197918, "learning_rate": 0.0001, "loss": 1.3948, "loss/crossentropy": 2.6445353031158447, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19944269210100174, "step": 13367 }, { "epoch": 0.19961325678107197, "grad_norm": 0.3046875, "grad_norm_var": 0.003386370340983073, "learning_rate": 0.0001, "loss": 1.4341, "loss/crossentropy": 2.6937782764434814, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19973182678222656, "step": 13368 }, { "epoch": 0.19962818895168696, "grad_norm": 0.36328125, "grad_norm_var": 0.0032718499501546224, "learning_rate": 0.0001, "loss": 1.6271, "loss/crossentropy": 2.517315626144409, "loss/fcd": 1.40234375, "loss/idx": 11.0, "loss/logits": 0.22470656782388687, "step": 13369 }, { "epoch": 0.19964312112230195, "grad_norm": 0.357421875, "grad_norm_var": 0.003340593973795573, "learning_rate": 0.0001, "loss": 1.5078, "loss/crossentropy": 2.7265501022338867, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.21096594631671906, "step": 13370 }, { "epoch": 0.19965805329291691, "grad_norm": 0.330078125, "grad_norm_var": 0.0031594435373942056, "learning_rate": 0.0001, "loss": 1.387, "loss/crossentropy": 2.5658769607543945, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19164231419563293, "step": 13371 }, { "epoch": 0.1996729854635319, "grad_norm": 0.337890625, "grad_norm_var": 0.0031489054361979168, "learning_rate": 0.0001, "loss": 1.5906, "loss/crossentropy": 2.467535972595215, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.2272966131567955, "step": 13372 }, { "epoch": 0.1996879176341469, "grad_norm": 0.310546875, "grad_norm_var": 0.003168169657389323, "learning_rate": 0.0001, "loss": 1.4623, "loss/crossentropy": 2.4127042293548584, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.212293803691864, "step": 13373 }, { "epoch": 0.19970284980476186, "grad_norm": 0.34375, "grad_norm_var": 0.00294645627339681, "learning_rate": 0.0001, "loss": 1.7469, "loss/crossentropy": 2.501283288002014, "loss/fcd": 1.45703125, "loss/idx": 11.0, "loss/logits": 0.2898639142513275, "step": 13374 }, { "epoch": 0.19971778197537685, "grad_norm": 0.302734375, "grad_norm_var": 0.0029979546864827475, "learning_rate": 0.0001, "loss": 1.5164, "loss/crossentropy": 2.617287516593933, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2234026938676834, "step": 13375 }, { "epoch": 0.19973271414599184, "grad_norm": 0.30859375, "grad_norm_var": 0.0029521783192952475, "learning_rate": 0.0001, "loss": 1.4206, "loss/crossentropy": 2.4887609481811523, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.18228860944509506, "step": 13376 }, { "epoch": 0.19974764631660683, "grad_norm": 0.302734375, "grad_norm_var": 0.00280609130859375, "learning_rate": 0.0001, "loss": 1.328, "loss/crossentropy": 2.5244412422180176, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1756245344877243, "step": 13377 }, { "epoch": 0.1997625784872218, "grad_norm": 0.2734375, "grad_norm_var": 0.0016031265258789062, "learning_rate": 0.0001, "loss": 1.491, "loss/crossentropy": 2.5646798610687256, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22147341072559357, "step": 13378 }, { "epoch": 0.19977751065783678, "grad_norm": 0.33984375, "grad_norm_var": 0.0007207075754801433, "learning_rate": 0.0001, "loss": 1.5014, "loss/crossentropy": 2.7894786596298218, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22013768553733826, "step": 13379 }, { "epoch": 0.19979244282845177, "grad_norm": 0.248046875, "grad_norm_var": 0.000964212417602539, "learning_rate": 0.0001, "loss": 1.2527, "loss/crossentropy": 2.481385827064514, "loss/fcd": 1.08984375, "loss/idx": 11.0, "loss/logits": 0.16288329660892487, "step": 13380 }, { "epoch": 0.19980737499906673, "grad_norm": 0.306640625, "grad_norm_var": 0.0009050846099853515, "learning_rate": 0.0001, "loss": 1.4113, "loss/crossentropy": 2.501286506652832, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19250217080116272, "step": 13381 }, { "epoch": 0.19982230716968172, "grad_norm": 0.296875, "grad_norm_var": 0.0009241104125976562, "learning_rate": 0.0001, "loss": 1.3963, "loss/crossentropy": 2.6251991987228394, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.20094036310911179, "step": 13382 }, { "epoch": 0.1998372393402967, "grad_norm": 0.2890625, "grad_norm_var": 0.0009358723958333334, "learning_rate": 0.0001, "loss": 1.3502, "loss/crossentropy": 2.6510077714920044, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18618252873420715, "step": 13383 }, { "epoch": 0.1998521715109117, "grad_norm": 0.3046875, "grad_norm_var": 0.0009358723958333334, "learning_rate": 0.0001, "loss": 1.4728, "loss/crossentropy": 2.571448564529419, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.19932330399751663, "step": 13384 }, { "epoch": 0.19986710368152666, "grad_norm": 0.32421875, "grad_norm_var": 0.0007718404134114584, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.841663122177124, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1925484985113144, "step": 13385 }, { "epoch": 0.19988203585214165, "grad_norm": 0.27734375, "grad_norm_var": 0.0006773471832275391, "learning_rate": 0.0001, "loss": 1.3575, "loss/crossentropy": 2.566467761993408, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1894855499267578, "step": 13386 }, { "epoch": 0.19989696802275664, "grad_norm": 0.3203125, "grad_norm_var": 0.0006519953409830729, "learning_rate": 0.0001, "loss": 1.6634, "loss/crossentropy": 3.0664525032043457, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.2767021358013153, "step": 13387 }, { "epoch": 0.1999119001933716, "grad_norm": 0.27734375, "grad_norm_var": 0.0006189823150634765, "learning_rate": 0.0001, "loss": 1.4048, "loss/crossentropy": 2.7822060585021973, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19386331737041473, "step": 13388 }, { "epoch": 0.1999268323639866, "grad_norm": 0.283203125, "grad_norm_var": 0.0006332238515218099, "learning_rate": 0.0001, "loss": 1.4909, "loss/crossentropy": 2.4169472455978394, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.20577199757099152, "step": 13389 }, { "epoch": 0.19994176453460158, "grad_norm": 0.302734375, "grad_norm_var": 0.0004987080891927083, "learning_rate": 0.0001, "loss": 1.4323, "loss/crossentropy": 2.471382737159729, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20968805253505707, "step": 13390 }, { "epoch": 0.19995669670521654, "grad_norm": 0.26953125, "grad_norm_var": 0.0005438327789306641, "learning_rate": 0.0001, "loss": 1.3058, "loss/crossentropy": 2.4472174644470215, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.16519934684038162, "step": 13391 }, { "epoch": 0.19997162887583153, "grad_norm": 0.3125, "grad_norm_var": 0.0005517164866129557, "learning_rate": 0.0001, "loss": 1.3635, "loss/crossentropy": 2.578226685523987, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1798759549856186, "step": 13392 }, { "epoch": 0.19998656104644652, "grad_norm": 0.298828125, "grad_norm_var": 0.0005489190419514974, "learning_rate": 0.0001, "loss": 1.6054, "loss/crossentropy": 2.474398374557495, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.25770725309848785, "step": 13393 }, { "epoch": 0.2000014932170615, "grad_norm": 0.30859375, "grad_norm_var": 0.0005237420399983723, "learning_rate": 0.0001, "loss": 1.4516, "loss/crossentropy": 2.513760805130005, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20552873611450195, "step": 13394 }, { "epoch": 0.20001642538767647, "grad_norm": 0.265625, "grad_norm_var": 0.00044884681701660154, "learning_rate": 0.0001, "loss": 1.3229, "loss/crossentropy": 2.6416287422180176, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.1822429895401001, "step": 13395 }, { "epoch": 0.20003135755829146, "grad_norm": 0.3046875, "grad_norm_var": 0.0003110249837239583, "learning_rate": 0.0001, "loss": 1.4672, "loss/crossentropy": 2.656986951828003, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.20161378383636475, "step": 13396 }, { "epoch": 0.20004628972890645, "grad_norm": 0.263671875, "grad_norm_var": 0.0003676732381184896, "learning_rate": 0.0001, "loss": 1.449, "loss/crossentropy": 2.5794448852539062, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.22248279303312302, "step": 13397 }, { "epoch": 0.2000612218995214, "grad_norm": 0.3046875, "grad_norm_var": 0.00037479400634765625, "learning_rate": 0.0001, "loss": 1.5497, "loss/crossentropy": 2.3288278579711914, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.22942717373371124, "step": 13398 }, { "epoch": 0.2000761540701364, "grad_norm": 0.314453125, "grad_norm_var": 0.0003977298736572266, "learning_rate": 0.0001, "loss": 1.3886, "loss/crossentropy": 2.495702385902405, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.16590885818004608, "step": 13399 }, { "epoch": 0.2000910862407514, "grad_norm": 0.279296875, "grad_norm_var": 0.00040785471598307293, "learning_rate": 0.0001, "loss": 1.4041, "loss/crossentropy": 2.5981154441833496, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19704793393611908, "step": 13400 }, { "epoch": 0.20010601841136638, "grad_norm": 0.38671875, "grad_norm_var": 0.000902239481608073, "learning_rate": 0.0001, "loss": 1.5624, "loss/crossentropy": 2.4396865367889404, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.24604037404060364, "step": 13401 }, { "epoch": 0.20012095058198134, "grad_norm": 0.263671875, "grad_norm_var": 0.0009517510732014974, "learning_rate": 0.0001, "loss": 1.4945, "loss/crossentropy": 2.5498706102371216, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.22890978306531906, "step": 13402 }, { "epoch": 0.20013588275259633, "grad_norm": 0.330078125, "grad_norm_var": 0.0009877522786458333, "learning_rate": 0.0001, "loss": 1.4536, "loss/crossentropy": 2.5469435453414917, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2035670280456543, "step": 13403 }, { "epoch": 0.20015081492321132, "grad_norm": 0.2890625, "grad_norm_var": 0.0009642918904622396, "learning_rate": 0.0001, "loss": 1.4491, "loss/crossentropy": 2.6766048669815063, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2068849578499794, "step": 13404 }, { "epoch": 0.20016574709382629, "grad_norm": 0.3046875, "grad_norm_var": 0.0009490807851155598, "learning_rate": 0.0001, "loss": 1.2712, "loss/crossentropy": 2.5646305084228516, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.16569313406944275, "step": 13405 }, { "epoch": 0.20018067926444127, "grad_norm": 0.255859375, "grad_norm_var": 0.0010688622792561849, "learning_rate": 0.0001, "loss": 1.2653, "loss/crossentropy": 2.5965917110443115, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.16761905699968338, "step": 13406 }, { "epoch": 0.20019561143505626, "grad_norm": 0.265625, "grad_norm_var": 0.0010841210683186849, "learning_rate": 0.0001, "loss": 1.3435, "loss/crossentropy": 2.621206283569336, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.19511184096336365, "step": 13407 }, { "epoch": 0.20021054360567123, "grad_norm": 0.265625, "grad_norm_var": 0.0011230309804280598, "learning_rate": 0.0001, "loss": 1.3599, "loss/crossentropy": 2.659753203392029, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18803869932889938, "step": 13408 }, { "epoch": 0.20022547577628622, "grad_norm": 0.2734375, "grad_norm_var": 0.0011463801066080729, "learning_rate": 0.0001, "loss": 1.3916, "loss/crossentropy": 2.6019457578659058, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1924302652478218, "step": 13409 }, { "epoch": 0.2002404079469012, "grad_norm": 0.33984375, "grad_norm_var": 0.0012755711873372396, "learning_rate": 0.0001, "loss": 1.559, "loss/crossentropy": 2.6275845766067505, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.24261432886123657, "step": 13410 }, { "epoch": 0.2002553401175162, "grad_norm": 0.275390625, "grad_norm_var": 0.0012443383534749349, "learning_rate": 0.0001, "loss": 1.3301, "loss/crossentropy": 2.3350131511688232, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1738826036453247, "step": 13411 }, { "epoch": 0.20027027228813116, "grad_norm": 0.35546875, "grad_norm_var": 0.00147245724995931, "learning_rate": 0.0001, "loss": 1.6083, "loss/crossentropy": 2.603040099143982, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.2606435641646385, "step": 13412 }, { "epoch": 0.20028520445874615, "grad_norm": 0.37109375, "grad_norm_var": 0.0017023722330729166, "learning_rate": 0.0001, "loss": 1.5743, "loss/crossentropy": 2.7152355909347534, "loss/fcd": 1.35546875, "loss/idx": 11.0, "loss/logits": 0.21881777048110962, "step": 13413 }, { "epoch": 0.20030013662936114, "grad_norm": 0.310546875, "grad_norm_var": 0.0017045180002848307, "learning_rate": 0.0001, "loss": 1.4928, "loss/crossentropy": 2.736783027648926, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22331532090902328, "step": 13414 }, { "epoch": 0.2003150687999761, "grad_norm": 0.333984375, "grad_norm_var": 0.0017528374989827474, "learning_rate": 0.0001, "loss": 1.4293, "loss/crossentropy": 2.3921979665756226, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.1910332590341568, "step": 13415 }, { "epoch": 0.2003300009705911, "grad_norm": 0.2734375, "grad_norm_var": 0.0017760594685872395, "learning_rate": 0.0001, "loss": 1.3304, "loss/crossentropy": 2.611263155937195, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18587443977594376, "step": 13416 }, { "epoch": 0.20034493314120608, "grad_norm": 0.30859375, "grad_norm_var": 0.001315752665201823, "learning_rate": 0.0001, "loss": 1.453, "loss/crossentropy": 2.4833818674087524, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20694077759981155, "step": 13417 }, { "epoch": 0.20035986531182104, "grad_norm": 0.291015625, "grad_norm_var": 0.0012262980143229166, "learning_rate": 0.0001, "loss": 1.3861, "loss/crossentropy": 2.679147720336914, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19077667593955994, "step": 13418 }, { "epoch": 0.20037479748243603, "grad_norm": 0.32421875, "grad_norm_var": 0.0012070814768473308, "learning_rate": 0.0001, "loss": 1.4442, "loss/crossentropy": 2.854143977165222, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21372362226247787, "step": 13419 }, { "epoch": 0.20038972965305102, "grad_norm": 0.2890625, "grad_norm_var": 0.0012070814768473308, "learning_rate": 0.0001, "loss": 1.2946, "loss/crossentropy": 2.5470272302627563, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.16180521249771118, "step": 13420 }, { "epoch": 0.200404661823666, "grad_norm": 0.298828125, "grad_norm_var": 0.0012074152628580729, "learning_rate": 0.0001, "loss": 1.4546, "loss/crossentropy": 2.597904682159424, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.2163533940911293, "step": 13421 }, { "epoch": 0.20041959399428097, "grad_norm": 0.275390625, "grad_norm_var": 0.0011110941569010416, "learning_rate": 0.0001, "loss": 1.3608, "loss/crossentropy": 2.664372682571411, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18895476311445236, "step": 13422 }, { "epoch": 0.20043452616489596, "grad_norm": 0.314453125, "grad_norm_var": 0.0010153293609619141, "learning_rate": 0.0001, "loss": 1.4874, "loss/crossentropy": 2.655332326889038, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.21783015131950378, "step": 13423 }, { "epoch": 0.20044945833551095, "grad_norm": 0.30859375, "grad_norm_var": 0.0008978366851806641, "learning_rate": 0.0001, "loss": 1.5433, "loss/crossentropy": 2.4777663946151733, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.2268693447113037, "step": 13424 }, { "epoch": 0.2004643905061259, "grad_norm": 0.279296875, "grad_norm_var": 0.0008722305297851563, "learning_rate": 0.0001, "loss": 1.3283, "loss/crossentropy": 2.8131428956985474, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18378746509552002, "step": 13425 }, { "epoch": 0.2004793226767409, "grad_norm": 0.333984375, "grad_norm_var": 0.0008505344390869141, "learning_rate": 0.0001, "loss": 1.5262, "loss/crossentropy": 2.4472451210021973, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.22155866771936417, "step": 13426 }, { "epoch": 0.2004942548473559, "grad_norm": 0.34765625, "grad_norm_var": 0.0008534749348958333, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.6155765056610107, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1809171885251999, "step": 13427 }, { "epoch": 0.20050918701797088, "grad_norm": 0.30078125, "grad_norm_var": 0.0007342020670572917, "learning_rate": 0.0001, "loss": 1.3594, "loss/crossentropy": 2.5880818367004395, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17973143607378006, "step": 13428 }, { "epoch": 0.20052411918858584, "grad_norm": 0.28515625, "grad_norm_var": 0.0004964192708333333, "learning_rate": 0.0001, "loss": 1.2815, "loss/crossentropy": 2.4589675664901733, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.16044866293668747, "step": 13429 }, { "epoch": 0.20053905135920083, "grad_norm": 0.302734375, "grad_norm_var": 0.0004941304524739583, "learning_rate": 0.0001, "loss": 1.4139, "loss/crossentropy": 2.7263723611831665, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.2068488448858261, "step": 13430 }, { "epoch": 0.20055398352981582, "grad_norm": 0.322265625, "grad_norm_var": 0.00045617421468098957, "learning_rate": 0.0001, "loss": 1.3827, "loss/crossentropy": 2.6005773544311523, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19132810831069946, "step": 13431 }, { "epoch": 0.20056891570043078, "grad_norm": 0.291015625, "grad_norm_var": 0.0004051049550374349, "learning_rate": 0.0001, "loss": 1.3959, "loss/crossentropy": 2.5894858837127686, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.21234238147735596, "step": 13432 }, { "epoch": 0.20058384787104577, "grad_norm": 0.326171875, "grad_norm_var": 0.00043385823567708334, "learning_rate": 0.0001, "loss": 1.3966, "loss/crossentropy": 2.5757956504821777, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1974225789308548, "step": 13433 }, { "epoch": 0.20059878004166076, "grad_norm": 0.255859375, "grad_norm_var": 0.0005797704060872395, "learning_rate": 0.0001, "loss": 1.4002, "loss/crossentropy": 2.4736427068710327, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20101334899663925, "step": 13434 }, { "epoch": 0.20061371221227572, "grad_norm": 0.5546875, "grad_norm_var": 0.004537200927734375, "learning_rate": 0.0001, "loss": 1.754, "loss/crossentropy": 3.0904109477996826, "loss/fcd": 1.48046875, "loss/idx": 11.0, "loss/logits": 0.2735501527786255, "step": 13435 }, { "epoch": 0.20062864438289071, "grad_norm": 0.3828125, "grad_norm_var": 0.004726409912109375, "learning_rate": 0.0001, "loss": 1.4849, "loss/crossentropy": 2.688617467880249, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.21537432074546814, "step": 13436 }, { "epoch": 0.2006435765535057, "grad_norm": 0.267578125, "grad_norm_var": 0.004891204833984375, "learning_rate": 0.0001, "loss": 1.4302, "loss/crossentropy": 2.3990169763565063, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19972597807645798, "step": 13437 }, { "epoch": 0.2006585087241207, "grad_norm": 0.328125, "grad_norm_var": 0.004738855361938477, "learning_rate": 0.0001, "loss": 1.5444, "loss/crossentropy": 2.5311055183410645, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.2358444407582283, "step": 13438 }, { "epoch": 0.20067344089473566, "grad_norm": 0.302734375, "grad_norm_var": 0.004764032363891601, "learning_rate": 0.0001, "loss": 1.4425, "loss/crossentropy": 2.484874963760376, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.20419929176568985, "step": 13439 }, { "epoch": 0.20068837306535064, "grad_norm": 0.283203125, "grad_norm_var": 0.004857635498046875, "learning_rate": 0.0001, "loss": 1.3066, "loss/crossentropy": 2.434152126312256, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1777421161532402, "step": 13440 }, { "epoch": 0.20070330523596563, "grad_norm": 0.287109375, "grad_norm_var": 0.004816182454427083, "learning_rate": 0.0001, "loss": 1.5004, "loss/crossentropy": 2.4198837280273438, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.21913592517375946, "step": 13441 }, { "epoch": 0.2007182374065806, "grad_norm": 0.30078125, "grad_norm_var": 0.004837528864542643, "learning_rate": 0.0001, "loss": 1.2401, "loss/crossentropy": 2.6290037631988525, "loss/fcd": 1.0859375, "loss/idx": 11.0, "loss/logits": 0.15415211766958237, "step": 13442 }, { "epoch": 0.20073316957719559, "grad_norm": 0.27734375, "grad_norm_var": 0.004898182551066081, "learning_rate": 0.0001, "loss": 1.351, "loss/crossentropy": 2.624363422393799, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18689418584108353, "step": 13443 }, { "epoch": 0.20074810174781058, "grad_norm": 0.37109375, "grad_norm_var": 0.0050572554270426435, "learning_rate": 0.0001, "loss": 1.6015, "loss/crossentropy": 2.6605327129364014, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.249905064702034, "step": 13444 }, { "epoch": 0.20076303391842557, "grad_norm": 0.2734375, "grad_norm_var": 0.0051221052805582685, "learning_rate": 0.0001, "loss": 1.2991, "loss/crossentropy": 2.567090153694153, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1545753851532936, "step": 13445 }, { "epoch": 0.20077796608904053, "grad_norm": 0.259765625, "grad_norm_var": 0.005338907241821289, "learning_rate": 0.0001, "loss": 1.4623, "loss/crossentropy": 2.245883584022522, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.22404314577579498, "step": 13446 }, { "epoch": 0.20079289825965552, "grad_norm": 0.3359375, "grad_norm_var": 0.0053588231404622395, "learning_rate": 0.0001, "loss": 1.4352, "loss/crossentropy": 2.657876133918762, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.19298900663852692, "step": 13447 }, { "epoch": 0.2008078304302705, "grad_norm": 0.294921875, "grad_norm_var": 0.005345408121744792, "learning_rate": 0.0001, "loss": 1.3459, "loss/crossentropy": 2.513680934906006, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.16617582738399506, "step": 13448 }, { "epoch": 0.20082276260088547, "grad_norm": 0.322265625, "grad_norm_var": 0.005342547098795573, "learning_rate": 0.0001, "loss": 1.4933, "loss/crossentropy": 2.6823983192443848, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.23160391300916672, "step": 13449 }, { "epoch": 0.20083769477150046, "grad_norm": 0.28515625, "grad_norm_var": 0.005151096979777018, "learning_rate": 0.0001, "loss": 1.34, "loss/crossentropy": 2.5286078453063965, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17200888693332672, "step": 13450 }, { "epoch": 0.20085262694211545, "grad_norm": 0.322265625, "grad_norm_var": 0.0012679417928059896, "learning_rate": 0.0001, "loss": 1.5749, "loss/crossentropy": 2.642503023147583, "loss/fcd": 1.3203125, "loss/idx": 11.0, "loss/logits": 0.2545819357037544, "step": 13451 }, { "epoch": 0.2008675591127304, "grad_norm": 0.28515625, "grad_norm_var": 0.0008626302083333333, "learning_rate": 0.0001, "loss": 1.4074, "loss/crossentropy": 2.647232174873352, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20817923545837402, "step": 13452 }, { "epoch": 0.2008824912833454, "grad_norm": 0.33984375, "grad_norm_var": 0.0008785088857014973, "learning_rate": 0.0001, "loss": 1.5951, "loss/crossentropy": 2.616459608078003, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.23568806052207947, "step": 13453 }, { "epoch": 0.2008974234539604, "grad_norm": 0.296875, "grad_norm_var": 0.0008403619130452473, "learning_rate": 0.0001, "loss": 1.3648, "loss/crossentropy": 2.654772996902466, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18510349094867706, "step": 13454 }, { "epoch": 0.20091235562457538, "grad_norm": 0.32421875, "grad_norm_var": 0.0008702596028645833, "learning_rate": 0.0001, "loss": 1.4891, "loss/crossentropy": 2.472112774848938, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2039795145392418, "step": 13455 }, { "epoch": 0.20092728779519034, "grad_norm": 0.296875, "grad_norm_var": 0.0008445580800374349, "learning_rate": 0.0001, "loss": 1.4808, "loss/crossentropy": 2.544337749481201, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.22686459124088287, "step": 13456 }, { "epoch": 0.20094221996580533, "grad_norm": 0.28125, "grad_norm_var": 0.0008603413899739583, "learning_rate": 0.0001, "loss": 1.4274, "loss/crossentropy": 2.587520122528076, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20867999643087387, "step": 13457 }, { "epoch": 0.20095715213642032, "grad_norm": 0.265625, "grad_norm_var": 0.0009536107381184895, "learning_rate": 0.0001, "loss": 1.306, "loss/crossentropy": 2.5956863164901733, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.16927683353424072, "step": 13458 }, { "epoch": 0.20097208430703528, "grad_norm": 0.263671875, "grad_norm_var": 0.0010102430979410807, "learning_rate": 0.0001, "loss": 1.3298, "loss/crossentropy": 2.461369514465332, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1774565875530243, "step": 13459 }, { "epoch": 0.20098701647765027, "grad_norm": 0.271484375, "grad_norm_var": 0.0007013956705729167, "learning_rate": 0.0001, "loss": 1.338, "loss/crossentropy": 2.598204255104065, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.19351106137037277, "step": 13460 }, { "epoch": 0.20100194864826526, "grad_norm": 0.28515625, "grad_norm_var": 0.0006764094034830729, "learning_rate": 0.0001, "loss": 1.3347, "loss/crossentropy": 2.631948709487915, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.18236084282398224, "step": 13461 }, { "epoch": 0.20101688081888025, "grad_norm": 0.291015625, "grad_norm_var": 0.0005879084269205729, "learning_rate": 0.0001, "loss": 1.3335, "loss/crossentropy": 2.8409186601638794, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.19286762923002243, "step": 13462 }, { "epoch": 0.2010318129894952, "grad_norm": 0.298828125, "grad_norm_var": 0.0004843235015869141, "learning_rate": 0.0001, "loss": 1.467, "loss/crossentropy": 2.595834255218506, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.20140957832336426, "step": 13463 }, { "epoch": 0.2010467451601102, "grad_norm": 0.5078125, "grad_norm_var": 0.00330657958984375, "learning_rate": 0.0001, "loss": 1.5887, "loss/crossentropy": 2.8719985485076904, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.25671616196632385, "step": 13464 }, { "epoch": 0.2010616773307252, "grad_norm": 0.318359375, "grad_norm_var": 0.0033004124959309894, "learning_rate": 0.0001, "loss": 1.3677, "loss/crossentropy": 2.5993953943252563, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19190499186515808, "step": 13465 }, { "epoch": 0.20107660950134015, "grad_norm": 0.326171875, "grad_norm_var": 0.003278716405232747, "learning_rate": 0.0001, "loss": 1.4899, "loss/crossentropy": 2.416306495666504, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.1969747170805931, "step": 13466 }, { "epoch": 0.20109154167195514, "grad_norm": 0.279296875, "grad_norm_var": 0.0033290704091389974, "learning_rate": 0.0001, "loss": 1.297, "loss/crossentropy": 2.6151092052459717, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1680838018655777, "step": 13467 }, { "epoch": 0.20110647384257013, "grad_norm": 0.3125, "grad_norm_var": 0.0032916863759358725, "learning_rate": 0.0001, "loss": 1.2485, "loss/crossentropy": 2.6791563034057617, "loss/fcd": 1.07421875, "loss/idx": 11.0, "loss/logits": 0.17426229268312454, "step": 13468 }, { "epoch": 0.2011214060131851, "grad_norm": 0.341796875, "grad_norm_var": 0.003299713134765625, "learning_rate": 0.0001, "loss": 1.417, "loss/crossentropy": 2.4369174242019653, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20212258398532867, "step": 13469 }, { "epoch": 0.20113633818380008, "grad_norm": 0.3828125, "grad_norm_var": 0.0036102294921875, "learning_rate": 0.0001, "loss": 1.417, "loss/crossentropy": 2.5869059562683105, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20610682666301727, "step": 13470 }, { "epoch": 0.20115127035441507, "grad_norm": 0.3671875, "grad_norm_var": 0.0037759780883789063, "learning_rate": 0.0001, "loss": 1.8162, "loss/crossentropy": 2.602267265319824, "loss/fcd": 1.5234375, "loss/idx": 11.0, "loss/logits": 0.29277747869491577, "step": 13471 }, { "epoch": 0.20116620252503006, "grad_norm": 0.27734375, "grad_norm_var": 0.003855133056640625, "learning_rate": 0.0001, "loss": 1.3457, "loss/crossentropy": 2.7352027893066406, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.18558473885059357, "step": 13472 }, { "epoch": 0.20118113469564503, "grad_norm": 0.310546875, "grad_norm_var": 0.003769540786743164, "learning_rate": 0.0001, "loss": 1.4616, "loss/crossentropy": 2.6412633657455444, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.2076658457517624, "step": 13473 }, { "epoch": 0.20119606686626001, "grad_norm": 0.3046875, "grad_norm_var": 0.0035883426666259766, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.89803147315979, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18875758349895477, "step": 13474 }, { "epoch": 0.201210999036875, "grad_norm": 0.376953125, "grad_norm_var": 0.0035219669342041017, "learning_rate": 0.0001, "loss": 1.494, "loss/crossentropy": 2.601072311401367, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.19319774210453033, "step": 13475 }, { "epoch": 0.20122593120748997, "grad_norm": 0.3359375, "grad_norm_var": 0.0032938003540039064, "learning_rate": 0.0001, "loss": 1.5222, "loss/crossentropy": 2.426332712173462, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.21359678357839584, "step": 13476 }, { "epoch": 0.20124086337810496, "grad_norm": 0.26953125, "grad_norm_var": 0.0034072240193684894, "learning_rate": 0.0001, "loss": 1.3082, "loss/crossentropy": 2.5446293354034424, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1792844757437706, "step": 13477 }, { "epoch": 0.20125579554871995, "grad_norm": 0.2890625, "grad_norm_var": 0.0034179528554280597, "learning_rate": 0.0001, "loss": 1.3957, "loss/crossentropy": 2.7733891010284424, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19653048366308212, "step": 13478 }, { "epoch": 0.2012707277193349, "grad_norm": 0.296875, "grad_norm_var": 0.0034266153971354166, "learning_rate": 0.0001, "loss": 1.5197, "loss/crossentropy": 2.8814401626586914, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2306026592850685, "step": 13479 }, { "epoch": 0.2012856598899499, "grad_norm": 0.298828125, "grad_norm_var": 0.0012309869130452474, "learning_rate": 0.0001, "loss": 1.5244, "loss/crossentropy": 2.3054451942443848, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.23536384105682373, "step": 13480 }, { "epoch": 0.2013005920605649, "grad_norm": 0.296875, "grad_norm_var": 0.0012587865193684896, "learning_rate": 0.0001, "loss": 1.4218, "loss/crossentropy": 2.6686851978302, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20695245265960693, "step": 13481 }, { "epoch": 0.20131552423117988, "grad_norm": 0.33203125, "grad_norm_var": 0.0012683709462483724, "learning_rate": 0.0001, "loss": 1.5485, "loss/crossentropy": 2.415360927581787, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.21647147834300995, "step": 13482 }, { "epoch": 0.20133045640179484, "grad_norm": 0.283203125, "grad_norm_var": 0.00124967892964681, "learning_rate": 0.0001, "loss": 1.413, "loss/crossentropy": 2.5566630363464355, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.1981818601489067, "step": 13483 }, { "epoch": 0.20134538857240983, "grad_norm": 0.302734375, "grad_norm_var": 0.0012618382771809897, "learning_rate": 0.0001, "loss": 1.5017, "loss/crossentropy": 2.416139841079712, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22044000774621964, "step": 13484 }, { "epoch": 0.20136032074302482, "grad_norm": 0.361328125, "grad_norm_var": 0.001351165771484375, "learning_rate": 0.0001, "loss": 1.4226, "loss/crossentropy": 2.5695390701293945, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.1881905049085617, "step": 13485 }, { "epoch": 0.20137525291363978, "grad_norm": 0.29296875, "grad_norm_var": 0.0010777155558268229, "learning_rate": 0.0001, "loss": 1.3894, "loss/crossentropy": 2.668240547180176, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.20187965780496597, "step": 13486 }, { "epoch": 0.20139018508425477, "grad_norm": 0.421875, "grad_norm_var": 0.0016651789347330729, "learning_rate": 0.0001, "loss": 1.3701, "loss/crossentropy": 2.588745594024658, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.19818812608718872, "step": 13487 }, { "epoch": 0.20140511725486976, "grad_norm": 0.34765625, "grad_norm_var": 0.001614824930826823, "learning_rate": 0.0001, "loss": 1.6706, "loss/crossentropy": 2.3691486120224, "loss/fcd": 1.40234375, "loss/idx": 11.0, "loss/logits": 0.26822496205568314, "step": 13488 }, { "epoch": 0.20142004942548475, "grad_norm": 0.259765625, "grad_norm_var": 0.0018404642740885416, "learning_rate": 0.0001, "loss": 1.3003, "loss/crossentropy": 2.571005940437317, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.17917899787425995, "step": 13489 }, { "epoch": 0.2014349815960997, "grad_norm": 0.298828125, "grad_norm_var": 0.0018521467844645181, "learning_rate": 0.0001, "loss": 1.5417, "loss/crossentropy": 2.4234917163848877, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.24873086810112, "step": 13490 }, { "epoch": 0.2014499137667147, "grad_norm": 0.283203125, "grad_norm_var": 0.0016461531321207681, "learning_rate": 0.0001, "loss": 1.3977, "loss/crossentropy": 2.5022248029708862, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18674815446138382, "step": 13491 }, { "epoch": 0.2014648459373297, "grad_norm": 0.3125, "grad_norm_var": 0.0016015211741129557, "learning_rate": 0.0001, "loss": 1.4045, "loss/crossentropy": 2.4180409908294678, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18181003630161285, "step": 13492 }, { "epoch": 0.20147977810794465, "grad_norm": 0.3515625, "grad_norm_var": 0.0015881697336832682, "learning_rate": 0.0001, "loss": 1.5255, "loss/crossentropy": 2.649177670478821, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.23639801144599915, "step": 13493 }, { "epoch": 0.20149471027855964, "grad_norm": 0.283203125, "grad_norm_var": 0.0016100565592447917, "learning_rate": 0.0001, "loss": 1.3616, "loss/crossentropy": 2.6982697248458862, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18973815441131592, "step": 13494 }, { "epoch": 0.20150964244917463, "grad_norm": 0.310546875, "grad_norm_var": 0.001590585708618164, "learning_rate": 0.0001, "loss": 1.365, "loss/crossentropy": 2.591425657272339, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.17747823148965836, "step": 13495 }, { "epoch": 0.2015245746197896, "grad_norm": 0.5390625, "grad_norm_var": 0.004685401916503906, "learning_rate": 0.0001, "loss": 1.8687, "loss/crossentropy": 2.7559144496917725, "loss/fcd": 1.484375, "loss/idx": 11.0, "loss/logits": 0.3843352198600769, "step": 13496 }, { "epoch": 0.20153950679040458, "grad_norm": 0.337890625, "grad_norm_var": 0.004610300064086914, "learning_rate": 0.0001, "loss": 1.4498, "loss/crossentropy": 2.4420340061187744, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2154126912355423, "step": 13497 }, { "epoch": 0.20155443896101957, "grad_norm": 0.34375, "grad_norm_var": 0.004618310928344726, "learning_rate": 0.0001, "loss": 1.3601, "loss/crossentropy": 2.566182017326355, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18821834027767181, "step": 13498 }, { "epoch": 0.20156937113163456, "grad_norm": 0.2578125, "grad_norm_var": 0.004827626546223958, "learning_rate": 0.0001, "loss": 1.3088, "loss/crossentropy": 2.6120749711990356, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.17209186404943466, "step": 13499 }, { "epoch": 0.20158430330224952, "grad_norm": 0.37109375, "grad_norm_var": 0.004857110977172852, "learning_rate": 0.0001, "loss": 1.6768, "loss/crossentropy": 2.67697536945343, "loss/fcd": 1.43359375, "loss/idx": 11.0, "loss/logits": 0.24316389858722687, "step": 13500 }, { "epoch": 0.2015992354728645, "grad_norm": 0.26953125, "grad_norm_var": 0.005071512858072917, "learning_rate": 0.0001, "loss": 1.2305, "loss/crossentropy": 2.6121103763580322, "loss/fcd": 1.07421875, "loss/idx": 11.0, "loss/logits": 0.15628880262374878, "step": 13501 }, { "epoch": 0.2016141676434795, "grad_norm": 0.337890625, "grad_norm_var": 0.004975366592407227, "learning_rate": 0.0001, "loss": 1.4637, "loss/crossentropy": 2.7993385791778564, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.22150819003582, "step": 13502 }, { "epoch": 0.20162909981409446, "grad_norm": 0.326171875, "grad_norm_var": 0.004412269592285157, "learning_rate": 0.0001, "loss": 1.4941, "loss/crossentropy": 2.8238550424575806, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.2167649120092392, "step": 13503 }, { "epoch": 0.20164403198470945, "grad_norm": 0.3125, "grad_norm_var": 0.004392242431640625, "learning_rate": 0.0001, "loss": 1.4032, "loss/crossentropy": 2.774256944656372, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20401038229465485, "step": 13504 }, { "epoch": 0.20165896415532444, "grad_norm": 0.275390625, "grad_norm_var": 0.0042722066243489586, "learning_rate": 0.0001, "loss": 1.2341, "loss/crossentropy": 2.6585261821746826, "loss/fcd": 1.08203125, "loss/idx": 11.0, "loss/logits": 0.15210265666246414, "step": 13505 }, { "epoch": 0.20167389632593943, "grad_norm": 0.296875, "grad_norm_var": 0.004279438654581706, "learning_rate": 0.0001, "loss": 1.3575, "loss/crossentropy": 2.729093074798584, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18174497783184052, "step": 13506 }, { "epoch": 0.2016888284965544, "grad_norm": 0.283203125, "grad_norm_var": 0.004279438654581706, "learning_rate": 0.0001, "loss": 1.3434, "loss/crossentropy": 2.6137579679489136, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.17934371531009674, "step": 13507 }, { "epoch": 0.20170376066716939, "grad_norm": 0.29296875, "grad_norm_var": 0.004337294896443685, "learning_rate": 0.0001, "loss": 1.4794, "loss/crossentropy": 2.9032669067382812, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.20985833555459976, "step": 13508 }, { "epoch": 0.20171869283778437, "grad_norm": 0.2470703125, "grad_norm_var": 0.004640448093414307, "learning_rate": 0.0001, "loss": 1.2948, "loss/crossentropy": 2.6495940685272217, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.16980457305908203, "step": 13509 }, { "epoch": 0.20173362500839934, "grad_norm": 0.306640625, "grad_norm_var": 0.0045666337013244625, "learning_rate": 0.0001, "loss": 1.3723, "loss/crossentropy": 2.7050483226776123, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19262061268091202, "step": 13510 }, { "epoch": 0.20174855717901433, "grad_norm": 0.3125, "grad_norm_var": 0.004564599196116129, "learning_rate": 0.0001, "loss": 1.4647, "loss/crossentropy": 2.4950753450393677, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2029392495751381, "step": 13511 }, { "epoch": 0.20176348934962932, "grad_norm": 0.3203125, "grad_norm_var": 0.0011484106381734213, "learning_rate": 0.0001, "loss": 1.4283, "loss/crossentropy": 2.4556057453155518, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19391534477472305, "step": 13512 }, { "epoch": 0.20177842152024428, "grad_norm": 0.306640625, "grad_norm_var": 0.0010754227638244628, "learning_rate": 0.0001, "loss": 1.4769, "loss/crossentropy": 2.9166637659072876, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2307906448841095, "step": 13513 }, { "epoch": 0.20179335369085927, "grad_norm": 0.298828125, "grad_norm_var": 0.0009620944658915202, "learning_rate": 0.0001, "loss": 1.4566, "loss/crossentropy": 2.6776115894317627, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.21047575771808624, "step": 13514 }, { "epoch": 0.20180828586147426, "grad_norm": 0.31640625, "grad_norm_var": 0.000839547316233317, "learning_rate": 0.0001, "loss": 1.4193, "loss/crossentropy": 2.230055570602417, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.17716246098279953, "step": 13515 }, { "epoch": 0.20182321803208925, "grad_norm": 0.26953125, "grad_norm_var": 0.0005841533342997233, "learning_rate": 0.0001, "loss": 1.371, "loss/crossentropy": 2.5101972818374634, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1873672902584076, "step": 13516 }, { "epoch": 0.2018381502027042, "grad_norm": 0.31640625, "grad_norm_var": 0.0005418101946512859, "learning_rate": 0.0001, "loss": 1.4526, "loss/crossentropy": 2.7710381746292114, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.19872693717479706, "step": 13517 }, { "epoch": 0.2018530823733192, "grad_norm": 0.298828125, "grad_norm_var": 0.00044612487157185874, "learning_rate": 0.0001, "loss": 1.4319, "loss/crossentropy": 2.5535610914230347, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2053781896829605, "step": 13518 }, { "epoch": 0.2018680145439342, "grad_norm": 0.345703125, "grad_norm_var": 0.0005413333574930827, "learning_rate": 0.0001, "loss": 1.4674, "loss/crossentropy": 2.5557918548583984, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.20572268217802048, "step": 13519 }, { "epoch": 0.20188294671454915, "grad_norm": 0.291015625, "grad_norm_var": 0.0005343397458394369, "learning_rate": 0.0001, "loss": 1.3791, "loss/crossentropy": 2.5693126916885376, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1837771087884903, "step": 13520 }, { "epoch": 0.20189787888516414, "grad_norm": 0.3125, "grad_norm_var": 0.0005053480466206869, "learning_rate": 0.0001, "loss": 1.4141, "loss/crossentropy": 2.361016869544983, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19532669335603714, "step": 13521 }, { "epoch": 0.20191281105577913, "grad_norm": 0.30859375, "grad_norm_var": 0.0005075414975484212, "learning_rate": 0.0001, "loss": 1.3626, "loss/crossentropy": 2.792709469795227, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18682987987995148, "step": 13522 }, { "epoch": 0.20192774322639412, "grad_norm": 0.373046875, "grad_norm_var": 0.0007904966672261556, "learning_rate": 0.0001, "loss": 1.8161, "loss/crossentropy": 2.847914457321167, "loss/fcd": 1.51171875, "loss/idx": 11.0, "loss/logits": 0.3043588325381279, "step": 13523 }, { "epoch": 0.20194267539700908, "grad_norm": 0.2734375, "grad_norm_var": 0.0008516907691955566, "learning_rate": 0.0001, "loss": 1.3098, "loss/crossentropy": 2.60882306098938, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.17304813861846924, "step": 13524 }, { "epoch": 0.20195760756762407, "grad_norm": 0.2734375, "grad_norm_var": 0.0006876468658447265, "learning_rate": 0.0001, "loss": 1.2454, "loss/crossentropy": 2.5617446899414062, "loss/fcd": 1.0859375, "loss/idx": 11.0, "loss/logits": 0.15943042933940887, "step": 13525 }, { "epoch": 0.20197253973823906, "grad_norm": 0.3203125, "grad_norm_var": 0.00069732666015625, "learning_rate": 0.0001, "loss": 1.2476, "loss/crossentropy": 2.3649535179138184, "loss/fcd": 1.09375, "loss/idx": 11.0, "loss/logits": 0.15386808663606644, "step": 13526 }, { "epoch": 0.20198747190885402, "grad_norm": 0.255859375, "grad_norm_var": 0.0008683363596598307, "learning_rate": 0.0001, "loss": 1.2935, "loss/crossentropy": 2.4911550283432007, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.17635741084814072, "step": 13527 }, { "epoch": 0.202002404079469, "grad_norm": 0.263671875, "grad_norm_var": 0.0009536107381184895, "learning_rate": 0.0001, "loss": 1.363, "loss/crossentropy": 2.5343321561813354, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.19108322262763977, "step": 13528 }, { "epoch": 0.202017336250084, "grad_norm": 0.294921875, "grad_norm_var": 0.0009541829427083333, "learning_rate": 0.0001, "loss": 1.4252, "loss/crossentropy": 2.685283660888672, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20642736554145813, "step": 13529 }, { "epoch": 0.20203226842069896, "grad_norm": 0.283203125, "grad_norm_var": 0.0009735107421875, "learning_rate": 0.0001, "loss": 1.4648, "loss/crossentropy": 2.3138504028320312, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.22653459012508392, "step": 13530 }, { "epoch": 0.20204720059131395, "grad_norm": 0.330078125, "grad_norm_var": 0.0010154565175374349, "learning_rate": 0.0001, "loss": 1.5588, "loss/crossentropy": 2.841793179512024, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.24240174889564514, "step": 13531 }, { "epoch": 0.20206213276192894, "grad_norm": 0.267578125, "grad_norm_var": 0.0010238011678059896, "learning_rate": 0.0001, "loss": 1.4673, "loss/crossentropy": 2.2338808178901672, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2251320704817772, "step": 13532 }, { "epoch": 0.20207706493254393, "grad_norm": 0.3046875, "grad_norm_var": 0.0010075887044270833, "learning_rate": 0.0001, "loss": 1.4266, "loss/crossentropy": 2.4041707515716553, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20391004532575607, "step": 13533 }, { "epoch": 0.2020919971031589, "grad_norm": 0.376953125, "grad_norm_var": 0.0013788859049479166, "learning_rate": 0.0001, "loss": 1.5177, "loss/crossentropy": 2.5019463300704956, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.23251957446336746, "step": 13534 }, { "epoch": 0.20210692927377388, "grad_norm": 0.265625, "grad_norm_var": 0.0013417402903238933, "learning_rate": 0.0001, "loss": 1.4365, "loss/crossentropy": 2.3696192502975464, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2060244381427765, "step": 13535 }, { "epoch": 0.20212186144438887, "grad_norm": 0.337890625, "grad_norm_var": 0.0014249006907145181, "learning_rate": 0.0001, "loss": 1.3398, "loss/crossentropy": 2.5717159509658813, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.16399532556533813, "step": 13536 }, { "epoch": 0.20213679361500383, "grad_norm": 0.3046875, "grad_norm_var": 0.0014184157053629557, "learning_rate": 0.0001, "loss": 1.4549, "loss/crossentropy": 2.7671360969543457, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2126702517271042, "step": 13537 }, { "epoch": 0.20215172578561882, "grad_norm": 0.345703125, "grad_norm_var": 0.0015364964803059897, "learning_rate": 0.0001, "loss": 1.4431, "loss/crossentropy": 2.549026370048523, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.2008749544620514, "step": 13538 }, { "epoch": 0.20216665795623381, "grad_norm": 0.3359375, "grad_norm_var": 0.001283121109008789, "learning_rate": 0.0001, "loss": 1.5747, "loss/crossentropy": 2.6249877214431763, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.2426825910806656, "step": 13539 }, { "epoch": 0.20218159012684878, "grad_norm": 0.2890625, "grad_norm_var": 0.0012386163075764974, "learning_rate": 0.0001, "loss": 1.3889, "loss/crossentropy": 2.713282346725464, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1896824613213539, "step": 13540 }, { "epoch": 0.20219652229746377, "grad_norm": 0.3125, "grad_norm_var": 0.0011794884999593099, "learning_rate": 0.0001, "loss": 1.4521, "loss/crossentropy": 2.671935558319092, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21773944050073624, "step": 13541 }, { "epoch": 0.20221145446807876, "grad_norm": 0.27734375, "grad_norm_var": 0.0012102603912353515, "learning_rate": 0.0001, "loss": 1.2644, "loss/crossentropy": 2.5966588258743286, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.16283436119556427, "step": 13542 }, { "epoch": 0.20222638663869374, "grad_norm": 0.345703125, "grad_norm_var": 0.0011517683664957683, "learning_rate": 0.0001, "loss": 1.5049, "loss/crossentropy": 2.254077672958374, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.19625736773014069, "step": 13543 }, { "epoch": 0.2022413188093087, "grad_norm": 0.310546875, "grad_norm_var": 0.0010090986887613931, "learning_rate": 0.0001, "loss": 1.4192, "loss/crossentropy": 2.4415804147720337, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19266639649868011, "step": 13544 }, { "epoch": 0.2022562509799237, "grad_norm": 0.275390625, "grad_norm_var": 0.0010758558909098306, "learning_rate": 0.0001, "loss": 1.3766, "loss/crossentropy": 2.421581506729126, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18518368154764175, "step": 13545 }, { "epoch": 0.20227118315053869, "grad_norm": 0.283203125, "grad_norm_var": 0.0010758558909098306, "learning_rate": 0.0001, "loss": 1.383, "loss/crossentropy": 2.5937269926071167, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19548721611499786, "step": 13546 }, { "epoch": 0.20228611532115365, "grad_norm": 0.27734375, "grad_norm_var": 0.001109759012858073, "learning_rate": 0.0001, "loss": 1.4037, "loss/crossentropy": 2.4609768390655518, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.1967041864991188, "step": 13547 }, { "epoch": 0.20230104749176864, "grad_norm": 0.306640625, "grad_norm_var": 0.0010004043579101562, "learning_rate": 0.0001, "loss": 1.3276, "loss/crossentropy": 2.57033908367157, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17917967587709427, "step": 13548 }, { "epoch": 0.20231597966238363, "grad_norm": 0.2890625, "grad_norm_var": 0.0010253270467122397, "learning_rate": 0.0001, "loss": 1.4527, "loss/crossentropy": 2.577246069908142, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21832305938005447, "step": 13549 }, { "epoch": 0.20233091183299862, "grad_norm": 0.26953125, "grad_norm_var": 0.0007639408111572265, "learning_rate": 0.0001, "loss": 1.3675, "loss/crossentropy": 2.6634737253189087, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18387311697006226, "step": 13550 }, { "epoch": 0.20234584400361358, "grad_norm": 0.31640625, "grad_norm_var": 0.0006812890370686849, "learning_rate": 0.0001, "loss": 1.4993, "loss/crossentropy": 2.4538925886154175, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.24151530861854553, "step": 13551 }, { "epoch": 0.20236077617422857, "grad_norm": 0.44140625, "grad_norm_var": 0.0018075942993164063, "learning_rate": 0.0001, "loss": 1.5482, "loss/crossentropy": 2.6022536754608154, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.22012421488761902, "step": 13552 }, { "epoch": 0.20237570834484356, "grad_norm": 0.2890625, "grad_norm_var": 0.0018365859985351562, "learning_rate": 0.0001, "loss": 1.4093, "loss/crossentropy": 2.4793903827667236, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19057603180408478, "step": 13553 }, { "epoch": 0.20239064051545852, "grad_norm": 0.439453125, "grad_norm_var": 0.0028284072875976564, "learning_rate": 0.0001, "loss": 1.613, "loss/crossentropy": 2.396871328353882, "loss/fcd": 1.40625, "loss/idx": 11.0, "loss/logits": 0.20671872049570084, "step": 13554 }, { "epoch": 0.2024055726860735, "grad_norm": 0.294921875, "grad_norm_var": 0.0028254032135009766, "learning_rate": 0.0001, "loss": 1.4158, "loss/crossentropy": 2.594962000846863, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.18142830580472946, "step": 13555 }, { "epoch": 0.2024205048566885, "grad_norm": 0.361328125, "grad_norm_var": 0.0029153823852539062, "learning_rate": 0.0001, "loss": 1.612, "loss/crossentropy": 2.507521390914917, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.25257936120033264, "step": 13556 }, { "epoch": 0.20243543702730346, "grad_norm": 0.30078125, "grad_norm_var": 0.0029327392578125, "learning_rate": 0.0001, "loss": 1.5294, "loss/crossentropy": 2.66399347782135, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.2286004200577736, "step": 13557 }, { "epoch": 0.20245036919791845, "grad_norm": 0.341796875, "grad_norm_var": 0.0028482913970947266, "learning_rate": 0.0001, "loss": 1.6258, "loss/crossentropy": 2.72937548160553, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.28599704802036285, "step": 13558 }, { "epoch": 0.20246530136853344, "grad_norm": 0.296875, "grad_norm_var": 0.0028391520182291666, "learning_rate": 0.0001, "loss": 1.5111, "loss/crossentropy": 2.4801021814346313, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22987038642168045, "step": 13559 }, { "epoch": 0.20248023353914843, "grad_norm": 0.259765625, "grad_norm_var": 0.0030532201131184896, "learning_rate": 0.0001, "loss": 1.2975, "loss/crossentropy": 2.3651691675186157, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.17250490188598633, "step": 13560 }, { "epoch": 0.2024951657097634, "grad_norm": 0.279296875, "grad_norm_var": 0.003033447265625, "learning_rate": 0.0001, "loss": 1.3346, "loss/crossentropy": 2.714945673942566, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.1823030710220337, "step": 13561 }, { "epoch": 0.20251009788037838, "grad_norm": 0.267578125, "grad_norm_var": 0.0031158447265625, "learning_rate": 0.0001, "loss": 1.3174, "loss/crossentropy": 2.6662570238113403, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.18068329244852066, "step": 13562 }, { "epoch": 0.20252503005099337, "grad_norm": 0.3125, "grad_norm_var": 0.0030191421508789064, "learning_rate": 0.0001, "loss": 1.4099, "loss/crossentropy": 2.583932042121887, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.20284567028284073, "step": 13563 }, { "epoch": 0.20253996222160833, "grad_norm": 0.31640625, "grad_norm_var": 0.0030120690663655598, "learning_rate": 0.0001, "loss": 1.4669, "loss/crossentropy": 2.7394967079162598, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20906003564596176, "step": 13564 }, { "epoch": 0.20255489439222332, "grad_norm": 0.298828125, "grad_norm_var": 0.0029813130696614582, "learning_rate": 0.0001, "loss": 1.3575, "loss/crossentropy": 2.663113594055176, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18950382620096207, "step": 13565 }, { "epoch": 0.2025698265628383, "grad_norm": 0.287109375, "grad_norm_var": 0.0028873284657796225, "learning_rate": 0.0001, "loss": 1.3325, "loss/crossentropy": 2.713585615158081, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.17625346779823303, "step": 13566 }, { "epoch": 0.2025847587334533, "grad_norm": 0.310546875, "grad_norm_var": 0.0028914769490559896, "learning_rate": 0.0001, "loss": 1.3729, "loss/crossentropy": 2.5179678201675415, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1971408799290657, "step": 13567 }, { "epoch": 0.20259969090406826, "grad_norm": 0.28125, "grad_norm_var": 0.00187225341796875, "learning_rate": 0.0001, "loss": 1.3176, "loss/crossentropy": 2.621671199798584, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17303279787302017, "step": 13568 }, { "epoch": 0.20261462307468325, "grad_norm": 0.330078125, "grad_norm_var": 0.001870584487915039, "learning_rate": 0.0001, "loss": 1.4925, "loss/crossentropy": 2.554620862007141, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21908074617385864, "step": 13569 }, { "epoch": 0.20262955524529824, "grad_norm": 0.341796875, "grad_norm_var": 0.0007961114247639974, "learning_rate": 0.0001, "loss": 1.5164, "loss/crossentropy": 2.4659332036972046, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.20003344118595123, "step": 13570 }, { "epoch": 0.2026444874159132, "grad_norm": 0.271484375, "grad_norm_var": 0.0008621056874593099, "learning_rate": 0.0001, "loss": 1.3304, "loss/crossentropy": 2.4932122230529785, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17022813856601715, "step": 13571 }, { "epoch": 0.2026594195865282, "grad_norm": 0.31640625, "grad_norm_var": 0.00064239501953125, "learning_rate": 0.0001, "loss": 1.4242, "loss/crossentropy": 2.656033515930176, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20549418777227402, "step": 13572 }, { "epoch": 0.20267435175714318, "grad_norm": 0.275390625, "grad_norm_var": 0.0006826877593994141, "learning_rate": 0.0001, "loss": 1.3579, "loss/crossentropy": 2.754407286643982, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18600938469171524, "step": 13573 }, { "epoch": 0.20268928392775815, "grad_norm": 0.2890625, "grad_norm_var": 0.00055694580078125, "learning_rate": 0.0001, "loss": 1.49, "loss/crossentropy": 2.518222451210022, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21654020249843597, "step": 13574 }, { "epoch": 0.20270421609837314, "grad_norm": 0.287109375, "grad_norm_var": 0.0005616346995035807, "learning_rate": 0.0001, "loss": 1.4339, "loss/crossentropy": 2.564510703086853, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.2034498229622841, "step": 13575 }, { "epoch": 0.20271914826898813, "grad_norm": 0.291015625, "grad_norm_var": 0.0004746596018473307, "learning_rate": 0.0001, "loss": 1.4392, "loss/crossentropy": 2.6496870517730713, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2048225849866867, "step": 13576 }, { "epoch": 0.20273408043960311, "grad_norm": 0.322265625, "grad_norm_var": 0.00048724810282389325, "learning_rate": 0.0001, "loss": 1.4539, "loss/crossentropy": 2.6206297874450684, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.219501331448555, "step": 13577 }, { "epoch": 0.20274901261021808, "grad_norm": 0.310546875, "grad_norm_var": 0.0004173119862874349, "learning_rate": 0.0001, "loss": 1.4208, "loss/crossentropy": 2.768373489379883, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.19812826812267303, "step": 13578 }, { "epoch": 0.20276394478083307, "grad_norm": 0.2890625, "grad_norm_var": 0.0004207452138264974, "learning_rate": 0.0001, "loss": 1.3163, "loss/crossentropy": 2.6333826780319214, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17569589614868164, "step": 13579 }, { "epoch": 0.20277887695144806, "grad_norm": 0.345703125, "grad_norm_var": 0.0005339940388997395, "learning_rate": 0.0001, "loss": 1.5185, "loss/crossentropy": 2.738720178604126, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.21767109632492065, "step": 13580 }, { "epoch": 0.20279380912206302, "grad_norm": 0.267578125, "grad_norm_var": 0.000612322489420573, "learning_rate": 0.0001, "loss": 1.2907, "loss/crossentropy": 2.627697467803955, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.1696070432662964, "step": 13581 }, { "epoch": 0.202808741292678, "grad_norm": 0.34375, "grad_norm_var": 0.0007077376047770182, "learning_rate": 0.0001, "loss": 1.611, "loss/crossentropy": 2.6378530263900757, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.25945399701595306, "step": 13582 }, { "epoch": 0.202823673463293, "grad_norm": 0.359375, "grad_norm_var": 0.00089569091796875, "learning_rate": 0.0001, "loss": 1.475, "loss/crossentropy": 2.5401084423065186, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.20933163166046143, "step": 13583 }, { "epoch": 0.202838605633908, "grad_norm": 0.296875, "grad_norm_var": 0.00085601806640625, "learning_rate": 0.0001, "loss": 1.4855, "loss/crossentropy": 2.5110164880752563, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21984515339136124, "step": 13584 }, { "epoch": 0.20285353780452295, "grad_norm": 0.3359375, "grad_norm_var": 0.0008749485015869141, "learning_rate": 0.0001, "loss": 1.5108, "loss/crossentropy": 2.48112416267395, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.23341362923383713, "step": 13585 }, { "epoch": 0.20286846997513794, "grad_norm": 0.30859375, "grad_norm_var": 0.0007984797159830729, "learning_rate": 0.0001, "loss": 1.4924, "loss/crossentropy": 2.6857728958129883, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.22674061357975006, "step": 13586 }, { "epoch": 0.20288340214575293, "grad_norm": 0.30859375, "grad_norm_var": 0.0007093906402587891, "learning_rate": 0.0001, "loss": 1.4024, "loss/crossentropy": 2.5207319259643555, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19541172683238983, "step": 13587 }, { "epoch": 0.2028983343163679, "grad_norm": 0.2890625, "grad_norm_var": 0.0007298628489176433, "learning_rate": 0.0001, "loss": 1.4443, "loss/crossentropy": 2.3356233835220337, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.1982528492808342, "step": 13588 }, { "epoch": 0.20291326648698288, "grad_norm": 0.28515625, "grad_norm_var": 0.0006940205891927083, "learning_rate": 0.0001, "loss": 1.3645, "loss/crossentropy": 2.69681453704834, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18089528381824493, "step": 13589 }, { "epoch": 0.20292819865759787, "grad_norm": 0.310546875, "grad_norm_var": 0.0006683190663655599, "learning_rate": 0.0001, "loss": 1.4183, "loss/crossentropy": 2.5550429821014404, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19174087792634964, "step": 13590 }, { "epoch": 0.20294313082821283, "grad_norm": 0.36328125, "grad_norm_var": 0.0008040746053059896, "learning_rate": 0.0001, "loss": 1.5427, "loss/crossentropy": 2.603380560874939, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.22625982016324997, "step": 13591 }, { "epoch": 0.20295806299882782, "grad_norm": 0.283203125, "grad_norm_var": 0.0008320490519205729, "learning_rate": 0.0001, "loss": 1.269, "loss/crossentropy": 2.5737918615341187, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.15573832392692566, "step": 13592 }, { "epoch": 0.2029729951694428, "grad_norm": 0.30859375, "grad_norm_var": 0.0008281548817952474, "learning_rate": 0.0001, "loss": 1.4205, "loss/crossentropy": 2.634896755218506, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.1939414218068123, "step": 13593 }, { "epoch": 0.2029879273400578, "grad_norm": 0.35546875, "grad_norm_var": 0.000940386454264323, "learning_rate": 0.0001, "loss": 1.5945, "loss/crossentropy": 2.368237018585205, "loss/fcd": 1.390625, "loss/idx": 11.0, "loss/logits": 0.20387675613164902, "step": 13594 }, { "epoch": 0.20300285951067276, "grad_norm": 0.2890625, "grad_norm_var": 0.000940386454264323, "learning_rate": 0.0001, "loss": 1.3679, "loss/crossentropy": 2.459987998008728, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.17651138454675674, "step": 13595 }, { "epoch": 0.20301779168128775, "grad_norm": 0.294921875, "grad_norm_var": 0.0008982340494791667, "learning_rate": 0.0001, "loss": 1.4098, "loss/crossentropy": 2.689201593399048, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19888315349817276, "step": 13596 }, { "epoch": 0.20303272385190274, "grad_norm": 0.29296875, "grad_norm_var": 0.000786447525024414, "learning_rate": 0.0001, "loss": 1.3662, "loss/crossentropy": 2.506938099861145, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18256369978189468, "step": 13597 }, { "epoch": 0.2030476560225177, "grad_norm": 0.279296875, "grad_norm_var": 0.000791168212890625, "learning_rate": 0.0001, "loss": 1.4269, "loss/crossentropy": 2.576504349708557, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.2159714251756668, "step": 13598 }, { "epoch": 0.2030625881931327, "grad_norm": 0.421875, "grad_norm_var": 0.0014462788899739583, "learning_rate": 0.0001, "loss": 1.304, "loss/crossentropy": 2.369835138320923, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.15165331959724426, "step": 13599 }, { "epoch": 0.20307752036374768, "grad_norm": 0.30078125, "grad_norm_var": 0.0014383316040039063, "learning_rate": 0.0001, "loss": 1.4662, "loss/crossentropy": 2.493812084197998, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.20058805495500565, "step": 13600 }, { "epoch": 0.20309245253436264, "grad_norm": 0.32421875, "grad_norm_var": 0.0014129638671875, "learning_rate": 0.0001, "loss": 1.4537, "loss/crossentropy": 2.4422348737716675, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20762774348258972, "step": 13601 }, { "epoch": 0.20310738470497763, "grad_norm": 0.29296875, "grad_norm_var": 0.0014383951822916667, "learning_rate": 0.0001, "loss": 1.4416, "loss/crossentropy": 2.547217607498169, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21108496189117432, "step": 13602 }, { "epoch": 0.20312231687559262, "grad_norm": 0.314453125, "grad_norm_var": 0.0014374891916910807, "learning_rate": 0.0001, "loss": 1.4364, "loss/crossentropy": 2.4223062992095947, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20983575284481049, "step": 13603 }, { "epoch": 0.2031372490462076, "grad_norm": 0.30078125, "grad_norm_var": 0.0014088789621988932, "learning_rate": 0.0001, "loss": 1.4634, "loss/crossentropy": 2.6179128885269165, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2017051726579666, "step": 13604 }, { "epoch": 0.20315218121682257, "grad_norm": 0.3046875, "grad_norm_var": 0.0013586521148681641, "learning_rate": 0.0001, "loss": 1.5587, "loss/crossentropy": 2.5569159984588623, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.23448950052261353, "step": 13605 }, { "epoch": 0.20316711338743756, "grad_norm": 0.310546875, "grad_norm_var": 0.0013586521148681641, "learning_rate": 0.0001, "loss": 1.388, "loss/crossentropy": 2.5542664527893066, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.1849123314023018, "step": 13606 }, { "epoch": 0.20318204555805255, "grad_norm": 0.29296875, "grad_norm_var": 0.0012133121490478516, "learning_rate": 0.0001, "loss": 1.3327, "loss/crossentropy": 2.637412190437317, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1764838844537735, "step": 13607 }, { "epoch": 0.20319697772866752, "grad_norm": 0.3046875, "grad_norm_var": 0.0011641820271809896, "learning_rate": 0.0001, "loss": 1.384, "loss/crossentropy": 2.4217183589935303, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.17308593541383743, "step": 13608 }, { "epoch": 0.2032119098992825, "grad_norm": 0.287109375, "grad_norm_var": 0.0012021223704020182, "learning_rate": 0.0001, "loss": 1.448, "loss/crossentropy": 2.502273201942444, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20577828586101532, "step": 13609 }, { "epoch": 0.2032268420698975, "grad_norm": 0.283203125, "grad_norm_var": 0.001094500223795573, "learning_rate": 0.0001, "loss": 1.346, "loss/crossentropy": 2.4534354209899902, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1780681386590004, "step": 13610 }, { "epoch": 0.20324177424051248, "grad_norm": 0.2890625, "grad_norm_var": 0.001094500223795573, "learning_rate": 0.0001, "loss": 1.4121, "loss/crossentropy": 2.403549551963806, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19332516938447952, "step": 13611 }, { "epoch": 0.20325670641112745, "grad_norm": 0.474609375, "grad_norm_var": 0.0028492609659830728, "learning_rate": 0.0001, "loss": 1.6927, "loss/crossentropy": 2.5078539848327637, "loss/fcd": 1.43359375, "loss/idx": 11.0, "loss/logits": 0.2591291069984436, "step": 13612 }, { "epoch": 0.20327163858174244, "grad_norm": 0.271484375, "grad_norm_var": 0.002947346369425456, "learning_rate": 0.0001, "loss": 1.366, "loss/crossentropy": 2.4803918600082397, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18635905534029007, "step": 13613 }, { "epoch": 0.20328657075235743, "grad_norm": 0.30078125, "grad_norm_var": 0.0028716405232747395, "learning_rate": 0.0001, "loss": 1.3247, "loss/crossentropy": 2.8381699323654175, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.17627203464508057, "step": 13614 }, { "epoch": 0.2033015029229724, "grad_norm": 0.298828125, "grad_norm_var": 0.00209959348042806, "learning_rate": 0.0001, "loss": 1.3937, "loss/crossentropy": 2.3516945838928223, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18277256190776825, "step": 13615 }, { "epoch": 0.20331643509358738, "grad_norm": 0.3671875, "grad_norm_var": 0.0022984663645426433, "learning_rate": 0.0001, "loss": 1.5256, "loss/crossentropy": 2.6289215087890625, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.2092382311820984, "step": 13616 }, { "epoch": 0.20333136726420237, "grad_norm": 0.326171875, "grad_norm_var": 0.002301470438639323, "learning_rate": 0.0001, "loss": 1.4038, "loss/crossentropy": 2.465965509414673, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.2045627012848854, "step": 13617 }, { "epoch": 0.20334629943481733, "grad_norm": 0.318359375, "grad_norm_var": 0.0022715091705322265, "learning_rate": 0.0001, "loss": 1.3797, "loss/crossentropy": 2.634901285171509, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1843404471874237, "step": 13618 }, { "epoch": 0.20336123160543232, "grad_norm": 0.291015625, "grad_norm_var": 0.002308511734008789, "learning_rate": 0.0001, "loss": 1.4635, "loss/crossentropy": 2.273048758506775, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2174377292394638, "step": 13619 }, { "epoch": 0.2033761637760473, "grad_norm": 0.294921875, "grad_norm_var": 0.00232086181640625, "learning_rate": 0.0001, "loss": 1.3226, "loss/crossentropy": 2.534210443496704, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1741526946425438, "step": 13620 }, { "epoch": 0.2033910959466623, "grad_norm": 0.28515625, "grad_norm_var": 0.002367591857910156, "learning_rate": 0.0001, "loss": 1.5513, "loss/crossentropy": 2.526299238204956, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.22707626968622208, "step": 13621 }, { "epoch": 0.20340602811727726, "grad_norm": 0.26953125, "grad_norm_var": 0.0024820804595947266, "learning_rate": 0.0001, "loss": 1.1999, "loss/crossentropy": 2.55755352973938, "loss/fcd": 1.046875, "loss/idx": 11.0, "loss/logits": 0.15307249128818512, "step": 13622 }, { "epoch": 0.20342096028789225, "grad_norm": 0.279296875, "grad_norm_var": 0.002524248758951823, "learning_rate": 0.0001, "loss": 1.331, "loss/crossentropy": 2.845118999481201, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17864470928907394, "step": 13623 }, { "epoch": 0.20343589245850724, "grad_norm": 0.259765625, "grad_norm_var": 0.0026752312978108725, "learning_rate": 0.0001, "loss": 1.3359, "loss/crossentropy": 2.552385091781616, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.17188411206007004, "step": 13624 }, { "epoch": 0.2034508246291222, "grad_norm": 0.3046875, "grad_norm_var": 0.0026501973470052084, "learning_rate": 0.0001, "loss": 1.3983, "loss/crossentropy": 2.571877598762512, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.20302454382181168, "step": 13625 }, { "epoch": 0.2034657567997372, "grad_norm": 0.357421875, "grad_norm_var": 0.0027577082316080728, "learning_rate": 0.0001, "loss": 1.5006, "loss/crossentropy": 2.6650564670562744, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.21932553499937057, "step": 13626 }, { "epoch": 0.20348068897035218, "grad_norm": 0.3046875, "grad_norm_var": 0.002725664774576823, "learning_rate": 0.0001, "loss": 1.3113, "loss/crossentropy": 2.698180675506592, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.18239659070968628, "step": 13627 }, { "epoch": 0.20349562114096717, "grad_norm": 0.306640625, "grad_norm_var": 0.0008639017740885417, "learning_rate": 0.0001, "loss": 1.3736, "loss/crossentropy": 2.6782091856002808, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18614523857831955, "step": 13628 }, { "epoch": 0.20351055331158213, "grad_norm": 0.322265625, "grad_norm_var": 0.0008167902628580729, "learning_rate": 0.0001, "loss": 1.3626, "loss/crossentropy": 2.7306324243545532, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1907714679837227, "step": 13629 }, { "epoch": 0.20352548548219712, "grad_norm": 0.294921875, "grad_norm_var": 0.0008225599924723307, "learning_rate": 0.0001, "loss": 1.2612, "loss/crossentropy": 2.674400210380554, "loss/fcd": 1.09375, "loss/idx": 11.0, "loss/logits": 0.1674618050456047, "step": 13630 }, { "epoch": 0.2035404176528121, "grad_norm": 0.306640625, "grad_norm_var": 0.0008198897043863933, "learning_rate": 0.0001, "loss": 1.4093, "loss/crossentropy": 2.874918580055237, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19447441399097443, "step": 13631 }, { "epoch": 0.20355534982342707, "grad_norm": 0.298828125, "grad_norm_var": 0.000550079345703125, "learning_rate": 0.0001, "loss": 1.3894, "loss/crossentropy": 2.65058696269989, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1902007907629013, "step": 13632 }, { "epoch": 0.20357028199404206, "grad_norm": 0.2734375, "grad_norm_var": 0.0005487918853759766, "learning_rate": 0.0001, "loss": 1.2417, "loss/crossentropy": 2.573835015296936, "loss/fcd": 1.08203125, "loss/idx": 11.0, "loss/logits": 0.15968850255012512, "step": 13633 }, { "epoch": 0.20358521416465705, "grad_norm": 0.296875, "grad_norm_var": 0.0005192438761393229, "learning_rate": 0.0001, "loss": 1.3136, "loss/crossentropy": 2.4622198343276978, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.1768934577703476, "step": 13634 }, { "epoch": 0.20360014633527201, "grad_norm": 0.3125, "grad_norm_var": 0.0005320072174072266, "learning_rate": 0.0001, "loss": 1.4193, "loss/crossentropy": 2.3633676767349243, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.18497268855571747, "step": 13635 }, { "epoch": 0.203615078505887, "grad_norm": 0.2470703125, "grad_norm_var": 0.000694588820139567, "learning_rate": 0.0001, "loss": 1.2461, "loss/crossentropy": 2.5840275287628174, "loss/fcd": 1.08203125, "loss/idx": 11.0, "loss/logits": 0.1640862226486206, "step": 13636 }, { "epoch": 0.203630010676502, "grad_norm": 0.322265625, "grad_norm_var": 0.0007320364316304524, "learning_rate": 0.0001, "loss": 1.4202, "loss/crossentropy": 2.705665707588196, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2014467716217041, "step": 13637 }, { "epoch": 0.20364494284711698, "grad_norm": 0.31640625, "grad_norm_var": 0.0006957968076070149, "learning_rate": 0.0001, "loss": 1.3069, "loss/crossentropy": 2.56251060962677, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.1623866930603981, "step": 13638 }, { "epoch": 0.20365987501773195, "grad_norm": 0.34765625, "grad_norm_var": 0.0007970452308654786, "learning_rate": 0.0001, "loss": 1.5525, "loss/crossentropy": 2.684291124343872, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.26339617371559143, "step": 13639 }, { "epoch": 0.20367480718834693, "grad_norm": 0.26953125, "grad_norm_var": 0.0007447520891825358, "learning_rate": 0.0001, "loss": 1.3883, "loss/crossentropy": 2.3560606241226196, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19302856922149658, "step": 13640 }, { "epoch": 0.20368973935896192, "grad_norm": 0.298828125, "grad_norm_var": 0.0007472316424051921, "learning_rate": 0.0001, "loss": 1.4471, "loss/crossentropy": 2.6294453144073486, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20486661046743393, "step": 13641 }, { "epoch": 0.2037046715295769, "grad_norm": 0.33203125, "grad_norm_var": 0.0006092031796773275, "learning_rate": 0.0001, "loss": 1.557, "loss/crossentropy": 2.4613258838653564, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.22500424087047577, "step": 13642 }, { "epoch": 0.20371960370019188, "grad_norm": 0.30078125, "grad_norm_var": 0.0006093621253967285, "learning_rate": 0.0001, "loss": 1.5149, "loss/crossentropy": 2.7427316904067993, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.23361192643642426, "step": 13643 }, { "epoch": 0.20373453587080687, "grad_norm": 0.296875, "grad_norm_var": 0.0006104747454325359, "learning_rate": 0.0001, "loss": 1.3547, "loss/crossentropy": 2.3895853757858276, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18280144035816193, "step": 13644 }, { "epoch": 0.20374946804142186, "grad_norm": 0.322265625, "grad_norm_var": 0.0006104747454325359, "learning_rate": 0.0001, "loss": 1.6092, "loss/crossentropy": 2.4460333585739136, "loss/fcd": 1.3671875, "loss/idx": 11.0, "loss/logits": 0.2419978529214859, "step": 13645 }, { "epoch": 0.20376440021203682, "grad_norm": 0.291015625, "grad_norm_var": 0.0006152749061584473, "learning_rate": 0.0001, "loss": 1.3013, "loss/crossentropy": 2.702371835708618, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.17240743339061737, "step": 13646 }, { "epoch": 0.2037793323826518, "grad_norm": 0.310546875, "grad_norm_var": 0.0006186127662658692, "learning_rate": 0.0001, "loss": 1.3908, "loss/crossentropy": 2.514154314994812, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.1837855502963066, "step": 13647 }, { "epoch": 0.2037942645532668, "grad_norm": 0.369140625, "grad_norm_var": 0.0008949875831604004, "learning_rate": 0.0001, "loss": 1.4995, "loss/crossentropy": 2.697448253631592, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.2182483822107315, "step": 13648 }, { "epoch": 0.20380919672388176, "grad_norm": 0.29296875, "grad_norm_var": 0.000832204023996989, "learning_rate": 0.0001, "loss": 1.4868, "loss/crossentropy": 2.343497157096863, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.21337486058473587, "step": 13649 }, { "epoch": 0.20382412889449675, "grad_norm": 0.36328125, "grad_norm_var": 0.0010100007057189941, "learning_rate": 0.0001, "loss": 1.7316, "loss/crossentropy": 2.5485148429870605, "loss/fcd": 1.4609375, "loss/idx": 11.0, "loss/logits": 0.2706357464194298, "step": 13650 }, { "epoch": 0.20383906106511174, "grad_norm": 0.294921875, "grad_norm_var": 0.0010283112525939942, "learning_rate": 0.0001, "loss": 1.3951, "loss/crossentropy": 2.780521273612976, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19587098062038422, "step": 13651 }, { "epoch": 0.2038539932357267, "grad_norm": 0.283203125, "grad_norm_var": 0.0008020401000976562, "learning_rate": 0.0001, "loss": 1.5084, "loss/crossentropy": 2.642236590385437, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.23109839856624603, "step": 13652 }, { "epoch": 0.2038689254063417, "grad_norm": 0.3828125, "grad_norm_var": 0.0011040846506754558, "learning_rate": 0.0001, "loss": 1.6605, "loss/crossentropy": 2.721239924430847, "loss/fcd": 1.40625, "loss/idx": 11.0, "loss/logits": 0.25427352637052536, "step": 13653 }, { "epoch": 0.20388385757695668, "grad_norm": 0.3203125, "grad_norm_var": 0.00110472043355306, "learning_rate": 0.0001, "loss": 1.5603, "loss/crossentropy": 2.2359143495559692, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.22438238561153412, "step": 13654 }, { "epoch": 0.20389878974757167, "grad_norm": 0.27734375, "grad_norm_var": 0.0011287530263264973, "learning_rate": 0.0001, "loss": 1.4046, "loss/crossentropy": 2.5308589935302734, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19754404574632645, "step": 13655 }, { "epoch": 0.20391372191818663, "grad_norm": 0.31640625, "grad_norm_var": 0.0009952386220296223, "learning_rate": 0.0001, "loss": 1.6103, "loss/crossentropy": 2.277108669281006, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.23140807449817657, "step": 13656 }, { "epoch": 0.20392865408880162, "grad_norm": 0.3046875, "grad_norm_var": 0.0009841283162434896, "learning_rate": 0.0001, "loss": 1.4444, "loss/crossentropy": 2.7627750635147095, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.21389537304639816, "step": 13657 }, { "epoch": 0.2039435862594166, "grad_norm": 0.3125, "grad_norm_var": 0.000966644287109375, "learning_rate": 0.0001, "loss": 1.4685, "loss/crossentropy": 2.568825602531433, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.22238192707300186, "step": 13658 }, { "epoch": 0.20395851843003157, "grad_norm": 0.36328125, "grad_norm_var": 0.0010927836100260416, "learning_rate": 0.0001, "loss": 1.5589, "loss/crossentropy": 2.549894690513611, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.21126137673854828, "step": 13659 }, { "epoch": 0.20397345060064656, "grad_norm": 0.298828125, "grad_norm_var": 0.0010872999827067056, "learning_rate": 0.0001, "loss": 1.4069, "loss/crossentropy": 2.8622299432754517, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20373055338859558, "step": 13660 }, { "epoch": 0.20398838277126155, "grad_norm": 0.28125, "grad_norm_var": 0.0011744181315104167, "learning_rate": 0.0001, "loss": 1.3939, "loss/crossentropy": 2.5429863929748535, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19464663416147232, "step": 13661 }, { "epoch": 0.2040033149418765, "grad_norm": 0.29296875, "grad_norm_var": 0.0011680444081624349, "learning_rate": 0.0001, "loss": 1.5378, "loss/crossentropy": 2.3023018836975098, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.2370508685708046, "step": 13662 }, { "epoch": 0.2040182471124915, "grad_norm": 0.302734375, "grad_norm_var": 0.0011780897776285807, "learning_rate": 0.0001, "loss": 1.4542, "loss/crossentropy": 2.426972985267639, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.2158840373158455, "step": 13663 }, { "epoch": 0.2040331792831065, "grad_norm": 0.29296875, "grad_norm_var": 0.0010014216105143229, "learning_rate": 0.0001, "loss": 1.425, "loss/crossentropy": 2.5548588037490845, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20230069011449814, "step": 13664 }, { "epoch": 0.20404811145372148, "grad_norm": 0.279296875, "grad_norm_var": 0.0010464827219645182, "learning_rate": 0.0001, "loss": 1.462, "loss/crossentropy": 2.3123377561569214, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2002417892217636, "step": 13665 }, { "epoch": 0.20406304362433644, "grad_norm": 0.302734375, "grad_norm_var": 0.0008488972981770833, "learning_rate": 0.0001, "loss": 1.4772, "loss/crossentropy": 2.62056040763855, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21545174717903137, "step": 13666 }, { "epoch": 0.20407797579495143, "grad_norm": 0.330078125, "grad_norm_var": 0.0008712132771809896, "learning_rate": 0.0001, "loss": 1.426, "loss/crossentropy": 2.489063858985901, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19555962085723877, "step": 13667 }, { "epoch": 0.20409290796556642, "grad_norm": 0.43359375, "grad_norm_var": 0.0017707665761311849, "learning_rate": 0.0001, "loss": 1.3555, "loss/crossentropy": 2.673144817352295, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18358969688415527, "step": 13668 }, { "epoch": 0.20410784013618138, "grad_norm": 0.28515625, "grad_norm_var": 0.0015259901682535806, "learning_rate": 0.0001, "loss": 1.3241, "loss/crossentropy": 2.4618364572525024, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17956037819385529, "step": 13669 }, { "epoch": 0.20412277230679637, "grad_norm": 0.28125, "grad_norm_var": 0.0015787601470947266, "learning_rate": 0.0001, "loss": 1.3022, "loss/crossentropy": 2.6603697538375854, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.18504515290260315, "step": 13670 }, { "epoch": 0.20413770447741136, "grad_norm": 0.322265625, "grad_norm_var": 0.0015111287434895833, "learning_rate": 0.0001, "loss": 1.4462, "loss/crossentropy": 2.573526978492737, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20405928045511246, "step": 13671 }, { "epoch": 0.20415263664802635, "grad_norm": 0.302734375, "grad_norm_var": 0.0015156904856363932, "learning_rate": 0.0001, "loss": 1.458, "loss/crossentropy": 2.5341440439224243, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.20408976823091507, "step": 13672 }, { "epoch": 0.20416756881864132, "grad_norm": 0.267578125, "grad_norm_var": 0.001636187235514323, "learning_rate": 0.0001, "loss": 1.2322, "loss/crossentropy": 2.674000382423401, "loss/fcd": 1.0703125, "loss/idx": 11.0, "loss/logits": 0.16183965653181076, "step": 13673 }, { "epoch": 0.2041825009892563, "grad_norm": 0.3515625, "grad_norm_var": 0.0017480850219726562, "learning_rate": 0.0001, "loss": 1.5101, "loss/crossentropy": 2.569013237953186, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.20536918193101883, "step": 13674 }, { "epoch": 0.2041974331598713, "grad_norm": 0.412109375, "grad_norm_var": 0.0022324721018473306, "learning_rate": 0.0001, "loss": 1.3724, "loss/crossentropy": 2.7428014278411865, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18490077555179596, "step": 13675 }, { "epoch": 0.20421236533048626, "grad_norm": 0.6484375, "grad_norm_var": 0.009126218159993489, "learning_rate": 0.0001, "loss": 1.5697, "loss/crossentropy": 2.4856032133102417, "loss/fcd": 1.3515625, "loss/idx": 11.0, "loss/logits": 0.21808882057666779, "step": 13676 }, { "epoch": 0.20422729750110125, "grad_norm": 0.271484375, "grad_norm_var": 0.009204339981079102, "learning_rate": 0.0001, "loss": 1.2814, "loss/crossentropy": 2.5941030979156494, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.1759587973356247, "step": 13677 }, { "epoch": 0.20424222967171624, "grad_norm": 0.30078125, "grad_norm_var": 0.009163268407185872, "learning_rate": 0.0001, "loss": 1.3654, "loss/crossentropy": 2.661069631576538, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.197382353246212, "step": 13678 }, { "epoch": 0.2042571618423312, "grad_norm": 0.302734375, "grad_norm_var": 0.009163268407185872, "learning_rate": 0.0001, "loss": 1.5079, "loss/crossentropy": 2.5762689113616943, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.22278330475091934, "step": 13679 }, { "epoch": 0.2042720940129462, "grad_norm": 0.361328125, "grad_norm_var": 0.009058125813802083, "learning_rate": 0.0001, "loss": 1.693, "loss/crossentropy": 2.514228582382202, "loss/fcd": 1.42578125, "loss/idx": 11.0, "loss/logits": 0.2672334909439087, "step": 13680 }, { "epoch": 0.20428702618356118, "grad_norm": 0.267578125, "grad_norm_var": 0.009162839253743489, "learning_rate": 0.0001, "loss": 1.2489, "loss/crossentropy": 2.7156795263290405, "loss/fcd": 1.08203125, "loss/idx": 11.0, "loss/logits": 0.1668890044093132, "step": 13681 }, { "epoch": 0.20430195835417617, "grad_norm": 0.275390625, "grad_norm_var": 0.009345753987630209, "learning_rate": 0.0001, "loss": 1.3695, "loss/crossentropy": 2.587646007537842, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.18592185527086258, "step": 13682 }, { "epoch": 0.20431689052479113, "grad_norm": 0.31640625, "grad_norm_var": 0.009372568130493164, "learning_rate": 0.0001, "loss": 1.3815, "loss/crossentropy": 2.763582706451416, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19399971514940262, "step": 13683 }, { "epoch": 0.20433182269540612, "grad_norm": 0.3046875, "grad_norm_var": 0.00875992774963379, "learning_rate": 0.0001, "loss": 1.4455, "loss/crossentropy": 2.821350336074829, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21111777424812317, "step": 13684 }, { "epoch": 0.2043467548660211, "grad_norm": 0.322265625, "grad_norm_var": 0.008626747131347656, "learning_rate": 0.0001, "loss": 1.4292, "loss/crossentropy": 2.5935075283050537, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.19090213626623154, "step": 13685 }, { "epoch": 0.20436168703663607, "grad_norm": 0.3046875, "grad_norm_var": 0.008503150939941407, "learning_rate": 0.0001, "loss": 1.4609, "loss/crossentropy": 2.6517796516418457, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20310615748167038, "step": 13686 }, { "epoch": 0.20437661920725106, "grad_norm": 0.287109375, "grad_norm_var": 0.00863189697265625, "learning_rate": 0.0001, "loss": 1.403, "loss/crossentropy": 2.712743043899536, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20373310148715973, "step": 13687 }, { "epoch": 0.20439155137786605, "grad_norm": 0.30078125, "grad_norm_var": 0.008639510472615559, "learning_rate": 0.0001, "loss": 1.4164, "loss/crossentropy": 2.70392107963562, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.2015148550271988, "step": 13688 }, { "epoch": 0.20440648354848104, "grad_norm": 0.2890625, "grad_norm_var": 0.00848687489827474, "learning_rate": 0.0001, "loss": 1.3878, "loss/crossentropy": 2.3073174953460693, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18859164416790009, "step": 13689 }, { "epoch": 0.204421415719096, "grad_norm": 0.32421875, "grad_norm_var": 0.008463287353515625, "learning_rate": 0.0001, "loss": 1.459, "loss/crossentropy": 2.5176761150360107, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.1972627341747284, "step": 13690 }, { "epoch": 0.204436347889711, "grad_norm": 0.30859375, "grad_norm_var": 0.008007542292277018, "learning_rate": 0.0001, "loss": 1.614, "loss/crossentropy": 2.155500054359436, "loss/fcd": 1.36328125, "loss/idx": 11.0, "loss/logits": 0.2507093846797943, "step": 13691 }, { "epoch": 0.20445128006032598, "grad_norm": 0.333984375, "grad_norm_var": 0.0005889256795247396, "learning_rate": 0.0001, "loss": 1.3373, "loss/crossentropy": 2.7605453729629517, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.16539935022592545, "step": 13692 }, { "epoch": 0.20446621223094094, "grad_norm": 0.283203125, "grad_norm_var": 0.0005460103352864583, "learning_rate": 0.0001, "loss": 1.3653, "loss/crossentropy": 2.5727386474609375, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19733553379774094, "step": 13693 }, { "epoch": 0.20448114440155593, "grad_norm": 0.37109375, "grad_norm_var": 0.0008138020833333334, "learning_rate": 0.0001, "loss": 1.6932, "loss/crossentropy": 2.5915719270706177, "loss/fcd": 1.4296875, "loss/idx": 11.0, "loss/logits": 0.26354295015335083, "step": 13694 }, { "epoch": 0.20449607657217092, "grad_norm": 0.275390625, "grad_norm_var": 0.0008854548136393229, "learning_rate": 0.0001, "loss": 1.3953, "loss/crossentropy": 2.5268622636795044, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1961166337132454, "step": 13695 }, { "epoch": 0.20451100874278588, "grad_norm": 0.298828125, "grad_norm_var": 0.0006840387980143229, "learning_rate": 0.0001, "loss": 1.3797, "loss/crossentropy": 2.8454318046569824, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19607748091220856, "step": 13696 }, { "epoch": 0.20452594091340087, "grad_norm": 0.37109375, "grad_norm_var": 0.0008516788482666015, "learning_rate": 0.0001, "loss": 1.386, "loss/crossentropy": 2.655847907066345, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.190644271671772, "step": 13697 }, { "epoch": 0.20454087308401586, "grad_norm": 0.328125, "grad_norm_var": 0.0007791519165039062, "learning_rate": 0.0001, "loss": 1.4542, "loss/crossentropy": 2.895098328590393, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21984510868787766, "step": 13698 }, { "epoch": 0.20455580525463085, "grad_norm": 0.287109375, "grad_norm_var": 0.0008223056793212891, "learning_rate": 0.0001, "loss": 1.2817, "loss/crossentropy": 2.7201255559921265, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.1644735410809517, "step": 13699 }, { "epoch": 0.2045707374252458, "grad_norm": 0.318359375, "grad_norm_var": 0.0008208592732747396, "learning_rate": 0.0001, "loss": 1.3216, "loss/crossentropy": 2.8411372900009155, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.18093254417181015, "step": 13700 }, { "epoch": 0.2045856695958608, "grad_norm": 0.37890625, "grad_norm_var": 0.001093276341756185, "learning_rate": 0.0001, "loss": 1.5001, "loss/crossentropy": 2.516729950904846, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21104662120342255, "step": 13701 }, { "epoch": 0.2046006017664758, "grad_norm": 0.40234375, "grad_norm_var": 0.0015383243560791015, "learning_rate": 0.0001, "loss": 1.5331, "loss/crossentropy": 2.4966115951538086, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.21673326194286346, "step": 13702 }, { "epoch": 0.20461553393709075, "grad_norm": 0.34765625, "grad_norm_var": 0.0014826456705729167, "learning_rate": 0.0001, "loss": 1.5148, "loss/crossentropy": 2.6226959228515625, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.2218671292066574, "step": 13703 }, { "epoch": 0.20463046610770574, "grad_norm": 0.306640625, "grad_norm_var": 0.0014649550120035806, "learning_rate": 0.0001, "loss": 1.3051, "loss/crossentropy": 2.761213541030884, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.16833651065826416, "step": 13704 }, { "epoch": 0.20464539827832073, "grad_norm": 0.31640625, "grad_norm_var": 0.0013750553131103515, "learning_rate": 0.0001, "loss": 1.4663, "loss/crossentropy": 2.850572347640991, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.22016063332557678, "step": 13705 }, { "epoch": 0.20466033044893572, "grad_norm": 0.259765625, "grad_norm_var": 0.0016693115234375, "learning_rate": 0.0001, "loss": 1.3324, "loss/crossentropy": 2.489051103591919, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.18393155932426453, "step": 13706 }, { "epoch": 0.20467526261955069, "grad_norm": 0.431640625, "grad_norm_var": 0.0023592472076416015, "learning_rate": 0.0001, "loss": 1.3598, "loss/crossentropy": 2.7204692363739014, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19185908138751984, "step": 13707 }, { "epoch": 0.20469019479016567, "grad_norm": 0.330078125, "grad_norm_var": 0.0023591200510660808, "learning_rate": 0.0001, "loss": 1.4855, "loss/crossentropy": 2.80393648147583, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.2277122437953949, "step": 13708 }, { "epoch": 0.20470512696078066, "grad_norm": 0.3046875, "grad_norm_var": 0.0022491455078125, "learning_rate": 0.0001, "loss": 1.4348, "loss/crossentropy": 2.749096393585205, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.208218015730381, "step": 13709 }, { "epoch": 0.20472005913139563, "grad_norm": 0.306640625, "grad_norm_var": 0.0021814823150634764, "learning_rate": 0.0001, "loss": 1.3763, "loss/crossentropy": 2.6603517532348633, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1809902861714363, "step": 13710 }, { "epoch": 0.20473499130201062, "grad_norm": 0.310546875, "grad_norm_var": 0.0020075321197509767, "learning_rate": 0.0001, "loss": 1.4662, "loss/crossentropy": 2.5448068380355835, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.192776657640934, "step": 13711 }, { "epoch": 0.2047499234726256, "grad_norm": 0.337890625, "grad_norm_var": 0.0019344170888264975, "learning_rate": 0.0001, "loss": 1.4172, "loss/crossentropy": 2.616869807243347, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.21019896864891052, "step": 13712 }, { "epoch": 0.20476485564324057, "grad_norm": 0.4453125, "grad_norm_var": 0.002649545669555664, "learning_rate": 0.0001, "loss": 1.5179, "loss/crossentropy": 2.75123929977417, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.21320828050374985, "step": 13713 }, { "epoch": 0.20477978781385556, "grad_norm": 0.271484375, "grad_norm_var": 0.0029265721638997394, "learning_rate": 0.0001, "loss": 1.5404, "loss/crossentropy": 2.5140422582626343, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.24744442105293274, "step": 13714 }, { "epoch": 0.20479471998447055, "grad_norm": 0.294921875, "grad_norm_var": 0.0028807957967122394, "learning_rate": 0.0001, "loss": 1.3478, "loss/crossentropy": 2.5805402994155884, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17594684660434723, "step": 13715 }, { "epoch": 0.20480965215508554, "grad_norm": 0.2734375, "grad_norm_var": 0.0031078179677327474, "learning_rate": 0.0001, "loss": 1.2849, "loss/crossentropy": 2.670141577720642, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.1716003119945526, "step": 13716 }, { "epoch": 0.2048245843257005, "grad_norm": 0.3046875, "grad_norm_var": 0.0029918511708577475, "learning_rate": 0.0001, "loss": 1.3407, "loss/crossentropy": 2.2746613025665283, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1844562217593193, "step": 13717 }, { "epoch": 0.2048395164963155, "grad_norm": 0.296875, "grad_norm_var": 0.0026382287343343098, "learning_rate": 0.0001, "loss": 1.3453, "loss/crossentropy": 2.6876769065856934, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18122173845767975, "step": 13718 }, { "epoch": 0.20485444866693048, "grad_norm": 0.33984375, "grad_norm_var": 0.002614450454711914, "learning_rate": 0.0001, "loss": 1.7054, "loss/crossentropy": 2.440903425216675, "loss/fcd": 1.4296875, "loss/idx": 11.0, "loss/logits": 0.2756681740283966, "step": 13719 }, { "epoch": 0.20486938083754544, "grad_norm": 0.28125, "grad_norm_var": 0.0027022679646809896, "learning_rate": 0.0001, "loss": 1.3357, "loss/crossentropy": 2.4290037155151367, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1794106885790825, "step": 13720 }, { "epoch": 0.20488431300816043, "grad_norm": 0.34765625, "grad_norm_var": 0.0027521133422851564, "learning_rate": 0.0001, "loss": 1.5359, "loss/crossentropy": 2.693227171897888, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.2116699144244194, "step": 13721 }, { "epoch": 0.20489924517877542, "grad_norm": 0.27734375, "grad_norm_var": 0.0026278018951416014, "learning_rate": 0.0001, "loss": 1.3399, "loss/crossentropy": 2.5030025243759155, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1914835199713707, "step": 13722 }, { "epoch": 0.20491417734939038, "grad_norm": 0.30859375, "grad_norm_var": 0.00177764892578125, "learning_rate": 0.0001, "loss": 1.5128, "loss/crossentropy": 2.670047640800476, "loss/fcd": 1.28515625, "loss/idx": 11.0, "loss/logits": 0.2276352494955063, "step": 13723 }, { "epoch": 0.20492910952000537, "grad_norm": 0.287109375, "grad_norm_var": 0.0018035252888997397, "learning_rate": 0.0001, "loss": 1.5174, "loss/crossentropy": 2.415697932243347, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.22439561039209366, "step": 13724 }, { "epoch": 0.20494404169062036, "grad_norm": 0.314453125, "grad_norm_var": 0.0018002669016520183, "learning_rate": 0.0001, "loss": 1.4901, "loss/crossentropy": 2.440897822380066, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.23620691895484924, "step": 13725 }, { "epoch": 0.20495897386123535, "grad_norm": 0.296875, "grad_norm_var": 0.0018136978149414062, "learning_rate": 0.0001, "loss": 1.4905, "loss/crossentropy": 2.507148027420044, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.2248963713645935, "step": 13726 }, { "epoch": 0.2049739060318503, "grad_norm": 0.291015625, "grad_norm_var": 0.0018407185872395834, "learning_rate": 0.0001, "loss": 1.4594, "loss/crossentropy": 2.628432035446167, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20939990878105164, "step": 13727 }, { "epoch": 0.2049888382024653, "grad_norm": 0.291015625, "grad_norm_var": 0.0018071492513020834, "learning_rate": 0.0001, "loss": 1.3927, "loss/crossentropy": 2.3616347312927246, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1739652007818222, "step": 13728 }, { "epoch": 0.2050037703730803, "grad_norm": 0.294921875, "grad_norm_var": 0.0004596551259358724, "learning_rate": 0.0001, "loss": 1.4185, "loss/crossentropy": 2.6603307723999023, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.19190050661563873, "step": 13729 }, { "epoch": 0.20501870254369525, "grad_norm": 0.36328125, "grad_norm_var": 0.0006591161092122396, "learning_rate": 0.0001, "loss": 1.5518, "loss/crossentropy": 2.8711562156677246, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.2393251210451126, "step": 13730 }, { "epoch": 0.20503363471431024, "grad_norm": 0.310546875, "grad_norm_var": 0.0006555557250976562, "learning_rate": 0.0001, "loss": 1.5774, "loss/crossentropy": 2.4802238941192627, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.24931460618972778, "step": 13731 }, { "epoch": 0.20504856688492523, "grad_norm": 0.349609375, "grad_norm_var": 0.0006983280181884766, "learning_rate": 0.0001, "loss": 1.5767, "loss/crossentropy": 2.368881106376648, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.2290818840265274, "step": 13732 }, { "epoch": 0.20506349905554022, "grad_norm": 0.33203125, "grad_norm_var": 0.0007268110911051432, "learning_rate": 0.0001, "loss": 1.2912, "loss/crossentropy": 2.6014950275421143, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.17012085020542145, "step": 13733 }, { "epoch": 0.20507843122615518, "grad_norm": 0.330078125, "grad_norm_var": 0.0007314046223958333, "learning_rate": 0.0001, "loss": 1.5026, "loss/crossentropy": 2.5346988439559937, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.2135602831840515, "step": 13734 }, { "epoch": 0.20509336339677017, "grad_norm": 0.271484375, "grad_norm_var": 0.0007831414540608723, "learning_rate": 0.0001, "loss": 1.3296, "loss/crossentropy": 2.604446530342102, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18511120975017548, "step": 13735 }, { "epoch": 0.20510829556738516, "grad_norm": 0.318359375, "grad_norm_var": 0.00073089599609375, "learning_rate": 0.0001, "loss": 1.4228, "loss/crossentropy": 2.610516905784607, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.19229735434055328, "step": 13736 }, { "epoch": 0.20512322773800012, "grad_norm": 0.328125, "grad_norm_var": 0.0006606419881184896, "learning_rate": 0.0001, "loss": 1.4994, "loss/crossentropy": 2.4652795791625977, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.2376794070005417, "step": 13737 }, { "epoch": 0.20513815990861511, "grad_norm": 0.263671875, "grad_norm_var": 0.0007324059804280599, "learning_rate": 0.0001, "loss": 1.3888, "loss/crossentropy": 2.453436255455017, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.19347723573446274, "step": 13738 }, { "epoch": 0.2051530920792301, "grad_norm": 0.298828125, "grad_norm_var": 0.0007394790649414062, "learning_rate": 0.0001, "loss": 1.3945, "loss/crossentropy": 2.6390563249588013, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.18359283357858658, "step": 13739 }, { "epoch": 0.20516802424984507, "grad_norm": 0.341796875, "grad_norm_var": 0.0007679621378580729, "learning_rate": 0.0001, "loss": 1.4465, "loss/crossentropy": 2.5953335762023926, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19652892649173737, "step": 13740 }, { "epoch": 0.20518295642046006, "grad_norm": 0.291015625, "grad_norm_var": 0.000795427958170573, "learning_rate": 0.0001, "loss": 1.3581, "loss/crossentropy": 2.7292598485946655, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.19403156638145447, "step": 13741 }, { "epoch": 0.20519788859107504, "grad_norm": 0.4296875, "grad_norm_var": 0.001651446024576823, "learning_rate": 0.0001, "loss": 1.5541, "loss/crossentropy": 2.3896743059158325, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.22204849869012833, "step": 13742 }, { "epoch": 0.20521282076169003, "grad_norm": 0.3046875, "grad_norm_var": 0.001611948013305664, "learning_rate": 0.0001, "loss": 1.3919, "loss/crossentropy": 2.6633328199386597, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.18487250059843063, "step": 13743 }, { "epoch": 0.205227752932305, "grad_norm": 0.2734375, "grad_norm_var": 0.001699066162109375, "learning_rate": 0.0001, "loss": 1.2596, "loss/crossentropy": 2.4772369861602783, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.15799403190612793, "step": 13744 }, { "epoch": 0.20524268510292, "grad_norm": 0.294921875, "grad_norm_var": 0.001699066162109375, "learning_rate": 0.0001, "loss": 1.3649, "loss/crossentropy": 2.5264304876327515, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.19690731167793274, "step": 13745 }, { "epoch": 0.20525761727353498, "grad_norm": 0.30859375, "grad_norm_var": 0.0015619913736979166, "learning_rate": 0.0001, "loss": 1.497, "loss/crossentropy": 2.6299304962158203, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.22751522064208984, "step": 13746 }, { "epoch": 0.20527254944414994, "grad_norm": 0.32421875, "grad_norm_var": 0.0015647729237874349, "learning_rate": 0.0001, "loss": 1.3508, "loss/crossentropy": 2.377697706222534, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.16718027740716934, "step": 13747 }, { "epoch": 0.20528748161476493, "grad_norm": 0.283203125, "grad_norm_var": 0.0015453179677327475, "learning_rate": 0.0001, "loss": 1.3754, "loss/crossentropy": 2.630317807197571, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1956753060221672, "step": 13748 }, { "epoch": 0.20530241378537992, "grad_norm": 0.298828125, "grad_norm_var": 0.0015261332194010417, "learning_rate": 0.0001, "loss": 1.4697, "loss/crossentropy": 2.709630012512207, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.2197079211473465, "step": 13749 }, { "epoch": 0.2053173459559949, "grad_norm": 0.30859375, "grad_norm_var": 0.001497634251912435, "learning_rate": 0.0001, "loss": 1.3974, "loss/crossentropy": 2.654281735420227, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.19428492337465286, "step": 13750 }, { "epoch": 0.20533227812660987, "grad_norm": 0.283203125, "grad_norm_var": 0.00144804318745931, "learning_rate": 0.0001, "loss": 1.2816, "loss/crossentropy": 2.6865965127944946, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.1644015982747078, "step": 13751 }, { "epoch": 0.20534721029722486, "grad_norm": 0.271484375, "grad_norm_var": 0.001529677708943685, "learning_rate": 0.0001, "loss": 1.3714, "loss/crossentropy": 2.509407639503479, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.1916719451546669, "step": 13752 }, { "epoch": 0.20536214246783985, "grad_norm": 0.2890625, "grad_norm_var": 0.0015125115712483724, "learning_rate": 0.0001, "loss": 1.3451, "loss/crossentropy": 2.6175296306610107, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18882440030574799, "step": 13753 }, { "epoch": 0.2053770746384548, "grad_norm": 0.296875, "grad_norm_var": 0.0014025370279947917, "learning_rate": 0.0001, "loss": 1.4575, "loss/crossentropy": 2.535476565361023, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.19967928528785706, "step": 13754 }, { "epoch": 0.2053920068090698, "grad_norm": 0.392578125, "grad_norm_var": 0.0018603006998697917, "learning_rate": 0.0001, "loss": 1.7366, "loss/crossentropy": 2.8440312147140503, "loss/fcd": 1.48828125, "loss/idx": 11.0, "loss/logits": 0.24835912883281708, "step": 13755 }, { "epoch": 0.2054069389796848, "grad_norm": 0.296875, "grad_norm_var": 0.001808023452758789, "learning_rate": 0.0001, "loss": 1.4153, "loss/crossentropy": 2.6966320276260376, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19654294103384018, "step": 13756 }, { "epoch": 0.20542187115029975, "grad_norm": 0.271484375, "grad_norm_var": 0.0018792311350504557, "learning_rate": 0.0001, "loss": 1.2622, "loss/crossentropy": 2.682481050491333, "loss/fcd": 1.1015625, "loss/idx": 11.0, "loss/logits": 0.16062641143798828, "step": 13757 }, { "epoch": 0.20543680332091474, "grad_norm": 0.3046875, "grad_norm_var": 0.0008273919423421224, "learning_rate": 0.0001, "loss": 1.2465, "loss/crossentropy": 2.196303427219391, "loss/fcd": 1.1171875, "loss/idx": 11.0, "loss/logits": 0.12935146689414978, "step": 13758 }, { "epoch": 0.20545173549152973, "grad_norm": 0.296875, "grad_norm_var": 0.0008265018463134765, "learning_rate": 0.0001, "loss": 1.2959, "loss/crossentropy": 2.582674741744995, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.17085795104503632, "step": 13759 }, { "epoch": 0.20546666766214472, "grad_norm": 0.302734375, "grad_norm_var": 0.0007776260375976562, "learning_rate": 0.0001, "loss": 1.3315, "loss/crossentropy": 2.747452974319458, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17919252067804337, "step": 13760 }, { "epoch": 0.20548159983275968, "grad_norm": 0.26953125, "grad_norm_var": 0.0008402347564697265, "learning_rate": 0.0001, "loss": 1.3904, "loss/crossentropy": 2.551114797592163, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.19897355139255524, "step": 13761 }, { "epoch": 0.20549653200337467, "grad_norm": 0.357421875, "grad_norm_var": 0.001045672098795573, "learning_rate": 0.0001, "loss": 1.6563, "loss/crossentropy": 2.6125388145446777, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.2461738884449005, "step": 13762 }, { "epoch": 0.20551146417398966, "grad_norm": 0.298828125, "grad_norm_var": 0.0010140577952067057, "learning_rate": 0.0001, "loss": 1.3676, "loss/crossentropy": 2.751951217651367, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.19186419248580933, "step": 13763 }, { "epoch": 0.20552639634460462, "grad_norm": 0.29296875, "grad_norm_var": 0.0009963353474934896, "learning_rate": 0.0001, "loss": 1.2258, "loss/crossentropy": 2.882006525993347, "loss/fcd": 1.07421875, "loss/idx": 11.0, "loss/logits": 0.15160006284713745, "step": 13764 }, { "epoch": 0.2055413285152196, "grad_norm": 0.263671875, "grad_norm_var": 0.0010884602864583333, "learning_rate": 0.0001, "loss": 1.2893, "loss/crossentropy": 2.6355305910110474, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.17604807019233704, "step": 13765 }, { "epoch": 0.2055562606858346, "grad_norm": 0.314453125, "grad_norm_var": 0.0010974725087483724, "learning_rate": 0.0001, "loss": 1.4029, "loss/crossentropy": 2.5249658823013306, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.1880628913640976, "step": 13766 }, { "epoch": 0.2055711928564496, "grad_norm": 0.330078125, "grad_norm_var": 0.0011287530263264973, "learning_rate": 0.0001, "loss": 1.4066, "loss/crossentropy": 2.4359482526779175, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19568227976560593, "step": 13767 }, { "epoch": 0.20558612502706455, "grad_norm": 0.27734375, "grad_norm_var": 0.0011061986287434896, "learning_rate": 0.0001, "loss": 1.3918, "loss/crossentropy": 2.468021273612976, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19254738837480545, "step": 13768 }, { "epoch": 0.20560105719767954, "grad_norm": 0.287109375, "grad_norm_var": 0.0011101881663004556, "learning_rate": 0.0001, "loss": 1.4128, "loss/crossentropy": 2.57930326461792, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20182421803474426, "step": 13769 }, { "epoch": 0.20561598936829453, "grad_norm": 0.3203125, "grad_norm_var": 0.0011243025461832683, "learning_rate": 0.0001, "loss": 1.5574, "loss/crossentropy": 2.6852883100509644, "loss/fcd": 1.32421875, "loss/idx": 11.0, "loss/logits": 0.23322053998708725, "step": 13770 }, { "epoch": 0.2056309215389095, "grad_norm": 0.322265625, "grad_norm_var": 0.0006104628245035807, "learning_rate": 0.0001, "loss": 1.429, "loss/crossentropy": 2.695153832435608, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19459787011146545, "step": 13771 }, { "epoch": 0.20564585370952448, "grad_norm": 0.296875, "grad_norm_var": 0.0006104628245035807, "learning_rate": 0.0001, "loss": 1.4266, "loss/crossentropy": 2.900929093360901, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.21172333508729935, "step": 13772 }, { "epoch": 0.20566078588013947, "grad_norm": 0.294921875, "grad_norm_var": 0.0005543867746988933, "learning_rate": 0.0001, "loss": 1.5124, "loss/crossentropy": 2.7505217790603638, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.2311450093984604, "step": 13773 }, { "epoch": 0.20567571805075444, "grad_norm": 0.35546875, "grad_norm_var": 0.0007345676422119141, "learning_rate": 0.0001, "loss": 1.4707, "loss/crossentropy": 2.6933157444000244, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.19729499518871307, "step": 13774 }, { "epoch": 0.20569065022136943, "grad_norm": 0.283203125, "grad_norm_var": 0.0007611592610677083, "learning_rate": 0.0001, "loss": 1.363, "loss/crossentropy": 2.757527232170105, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.19115466624498367, "step": 13775 }, { "epoch": 0.20570558239198442, "grad_norm": 0.263671875, "grad_norm_var": 0.0008641560872395833, "learning_rate": 0.0001, "loss": 1.3108, "loss/crossentropy": 2.6344486474990845, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1780264750123024, "step": 13776 }, { "epoch": 0.2057205145625994, "grad_norm": 0.259765625, "grad_norm_var": 0.0009120782216389974, "learning_rate": 0.0001, "loss": 1.2677, "loss/crossentropy": 2.54026460647583, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.16218256950378418, "step": 13777 }, { "epoch": 0.20573544673321437, "grad_norm": 0.271484375, "grad_norm_var": 0.0007288455963134766, "learning_rate": 0.0001, "loss": 1.4199, "loss/crossentropy": 2.531848192214966, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.2011253908276558, "step": 13778 }, { "epoch": 0.20575037890382936, "grad_norm": 0.30859375, "grad_norm_var": 0.0007387797037760416, "learning_rate": 0.0001, "loss": 1.4378, "loss/crossentropy": 2.6001261472702026, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.2034313902258873, "step": 13779 }, { "epoch": 0.20576531107444435, "grad_norm": 0.34375, "grad_norm_var": 0.0008768081665039062, "learning_rate": 0.0001, "loss": 1.5244, "loss/crossentropy": 2.7184406518936157, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.2235717475414276, "step": 13780 }, { "epoch": 0.2057802432450593, "grad_norm": 0.26953125, "grad_norm_var": 0.0008509159088134766, "learning_rate": 0.0001, "loss": 1.35, "loss/crossentropy": 2.6708784103393555, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.1820303350687027, "step": 13781 }, { "epoch": 0.2057951754156743, "grad_norm": 0.322265625, "grad_norm_var": 0.0008698622385660807, "learning_rate": 0.0001, "loss": 1.5029, "loss/crossentropy": 2.2707005739212036, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.22550755739212036, "step": 13782 }, { "epoch": 0.2058101075862893, "grad_norm": 0.4296875, "grad_norm_var": 0.0018839518229166667, "learning_rate": 0.0001, "loss": 1.6067, "loss/crossentropy": 2.395079016685486, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.2238389551639557, "step": 13783 }, { "epoch": 0.20582503975690425, "grad_norm": 0.353515625, "grad_norm_var": 0.0019490400950113933, "learning_rate": 0.0001, "loss": 1.3573, "loss/crossentropy": 2.632541060447693, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.1815354898571968, "step": 13784 }, { "epoch": 0.20583997192751924, "grad_norm": 0.263671875, "grad_norm_var": 0.0020592848459879557, "learning_rate": 0.0001, "loss": 1.2409, "loss/crossentropy": 2.492693066596985, "loss/fcd": 1.0859375, "loss/idx": 11.0, "loss/logits": 0.1549898162484169, "step": 13785 }, { "epoch": 0.20585490409813423, "grad_norm": 0.349609375, "grad_norm_var": 0.002153460184733073, "learning_rate": 0.0001, "loss": 1.5483, "loss/crossentropy": 2.624978542327881, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.21624627709388733, "step": 13786 }, { "epoch": 0.20586983626874922, "grad_norm": 0.33203125, "grad_norm_var": 0.0021730899810791016, "learning_rate": 0.0001, "loss": 1.3227, "loss/crossentropy": 2.91092312335968, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.1821102276444435, "step": 13787 }, { "epoch": 0.20588476843936418, "grad_norm": 0.291015625, "grad_norm_var": 0.002187347412109375, "learning_rate": 0.0001, "loss": 1.5043, "loss/crossentropy": 2.4056921005249023, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.21128685772418976, "step": 13788 }, { "epoch": 0.20589970060997917, "grad_norm": 0.306640625, "grad_norm_var": 0.002169227600097656, "learning_rate": 0.0001, "loss": 1.427, "loss/crossentropy": 2.6731902360916138, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.2043168768286705, "step": 13789 }, { "epoch": 0.20591463278059416, "grad_norm": 0.3515625, "grad_norm_var": 0.002147928873697917, "learning_rate": 0.0001, "loss": 1.4951, "loss/crossentropy": 2.907194495201111, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.23334041982889175, "step": 13790 }, { "epoch": 0.20592956495120912, "grad_norm": 0.30078125, "grad_norm_var": 0.0020985762278238934, "learning_rate": 0.0001, "loss": 1.4234, "loss/crossentropy": 2.7221431732177734, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20074747502803802, "step": 13791 }, { "epoch": 0.2059444971218241, "grad_norm": 0.291015625, "grad_norm_var": 0.0019632816314697266, "learning_rate": 0.0001, "loss": 1.304, "loss/crossentropy": 2.598964214324951, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.1712278425693512, "step": 13792 }, { "epoch": 0.2059594292924391, "grad_norm": 0.333984375, "grad_norm_var": 0.0017579237620035807, "learning_rate": 0.0001, "loss": 1.364, "loss/crossentropy": 2.5220446586608887, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18435434997081757, "step": 13793 }, { "epoch": 0.2059743614630541, "grad_norm": 0.35546875, "grad_norm_var": 0.0016560872395833333, "learning_rate": 0.0001, "loss": 1.4854, "loss/crossentropy": 2.659604072570801, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.20419736951589584, "step": 13794 }, { "epoch": 0.20598929363366905, "grad_norm": 0.306640625, "grad_norm_var": 0.0016606489817301432, "learning_rate": 0.0001, "loss": 1.3924, "loss/crossentropy": 2.589516520500183, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.1931699961423874, "step": 13795 }, { "epoch": 0.20600422580428404, "grad_norm": 0.318359375, "grad_norm_var": 0.001637713114420573, "learning_rate": 0.0001, "loss": 1.6684, "loss/crossentropy": 2.369366765022278, "loss/fcd": 1.43359375, "loss/idx": 11.0, "loss/logits": 0.23484385758638382, "step": 13796 }, { "epoch": 0.20601915797489903, "grad_norm": 0.36328125, "grad_norm_var": 0.0015125910441080729, "learning_rate": 0.0001, "loss": 1.4098, "loss/crossentropy": 2.4087910652160645, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.19106734544038773, "step": 13797 }, { "epoch": 0.206034090145514, "grad_norm": 0.33203125, "grad_norm_var": 0.0015093326568603516, "learning_rate": 0.0001, "loss": 1.656, "loss/crossentropy": 2.6059446334838867, "loss/fcd": 1.41015625, "loss/idx": 11.0, "loss/logits": 0.2458852156996727, "step": 13798 }, { "epoch": 0.20604902231612898, "grad_norm": 0.287109375, "grad_norm_var": 0.0008839289347330729, "learning_rate": 0.0001, "loss": 1.4083, "loss/crossentropy": 2.805967926979065, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19738579541444778, "step": 13799 }, { "epoch": 0.20606395448674397, "grad_norm": 0.349609375, "grad_norm_var": 0.0008679707845052083, "learning_rate": 0.0001, "loss": 1.4598, "loss/crossentropy": 2.888966917991638, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.2566451132297516, "step": 13800 }, { "epoch": 0.20607888665735893, "grad_norm": 0.294921875, "grad_norm_var": 0.0006909688313802083, "learning_rate": 0.0001, "loss": 1.3556, "loss/crossentropy": 2.4734702110290527, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.17588671296834946, "step": 13801 }, { "epoch": 0.20609381882797392, "grad_norm": 0.4609375, "grad_norm_var": 0.0018642266591389974, "learning_rate": 0.0001, "loss": 1.4575, "loss/crossentropy": 2.4736703634262085, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.2035760059952736, "step": 13802 }, { "epoch": 0.2061087509985889, "grad_norm": 0.2734375, "grad_norm_var": 0.002060683568318685, "learning_rate": 0.0001, "loss": 1.3236, "loss/crossentropy": 2.683105230331421, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.17911387979984283, "step": 13803 }, { "epoch": 0.2061236831692039, "grad_norm": 0.28515625, "grad_norm_var": 0.002090199788411458, "learning_rate": 0.0001, "loss": 1.377, "loss/crossentropy": 2.792698621749878, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.18953806161880493, "step": 13804 }, { "epoch": 0.20613861533981886, "grad_norm": 0.2890625, "grad_norm_var": 0.0021541436513264974, "learning_rate": 0.0001, "loss": 1.6076, "loss/crossentropy": 2.698320746421814, "loss/fcd": 1.34765625, "loss/idx": 11.0, "loss/logits": 0.25996437668800354, "step": 13805 }, { "epoch": 0.20615354751043385, "grad_norm": 0.302734375, "grad_norm_var": 0.002127520243326823, "learning_rate": 0.0001, "loss": 1.4076, "loss/crossentropy": 2.624877452850342, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.2005239725112915, "step": 13806 }, { "epoch": 0.20616847968104884, "grad_norm": 0.486328125, "grad_norm_var": 0.00376585324605306, "learning_rate": 0.0001, "loss": 1.7297, "loss/crossentropy": 2.2765358686447144, "loss/fcd": 1.453125, "loss/idx": 11.0, "loss/logits": 0.2765398472547531, "step": 13807 }, { "epoch": 0.2061834118516638, "grad_norm": 0.3046875, "grad_norm_var": 0.0037007649739583333, "learning_rate": 0.0001, "loss": 1.3758, "loss/crossentropy": 2.715426802635193, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.18046005070209503, "step": 13808 }, { "epoch": 0.2061983440222788, "grad_norm": 0.30078125, "grad_norm_var": 0.003769667943318685, "learning_rate": 0.0001, "loss": 1.4298, "loss/crossentropy": 2.726868152618408, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.20715581625699997, "step": 13809 }, { "epoch": 0.20621327619289379, "grad_norm": 0.375, "grad_norm_var": 0.0038548628489176433, "learning_rate": 0.0001, "loss": 1.3202, "loss/crossentropy": 2.8928219079971313, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1717967540025711, "step": 13810 }, { "epoch": 0.20622820836350877, "grad_norm": 0.337890625, "grad_norm_var": 0.0038055260976155598, "learning_rate": 0.0001, "loss": 1.4271, "loss/crossentropy": 2.6227307319641113, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19270025193691254, "step": 13811 }, { "epoch": 0.20624314053412374, "grad_norm": 0.353515625, "grad_norm_var": 0.0038043816884358725, "learning_rate": 0.0001, "loss": 1.3632, "loss/crossentropy": 2.3444236516952515, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.1795809492468834, "step": 13812 }, { "epoch": 0.20625807270473873, "grad_norm": 0.287109375, "grad_norm_var": 0.0039029439290364583, "learning_rate": 0.0001, "loss": 1.4464, "loss/crossentropy": 2.5774112939834595, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.20027023553848267, "step": 13813 }, { "epoch": 0.20627300487535372, "grad_norm": 0.291015625, "grad_norm_var": 0.004010756810506185, "learning_rate": 0.0001, "loss": 1.2869, "loss/crossentropy": 2.7012414932250977, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.1736379712820053, "step": 13814 }, { "epoch": 0.20628793704596868, "grad_norm": 0.296875, "grad_norm_var": 0.003960927327473958, "learning_rate": 0.0001, "loss": 1.3828, "loss/crossentropy": 2.7416027784347534, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1953132450580597, "step": 13815 }, { "epoch": 0.20630286921658367, "grad_norm": 0.44140625, "grad_norm_var": 0.004720671971638998, "learning_rate": 0.0001, "loss": 1.6244, "loss/crossentropy": 2.770040273666382, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.23771753162145615, "step": 13816 }, { "epoch": 0.20631780138719866, "grad_norm": 0.279296875, "grad_norm_var": 0.004822142918904622, "learning_rate": 0.0001, "loss": 1.3274, "loss/crossentropy": 2.6397839784622192, "loss/fcd": 1.14453125, "loss/idx": 11.0, "loss/logits": 0.18290266394615173, "step": 13817 }, { "epoch": 0.20633273355781362, "grad_norm": 0.298828125, "grad_norm_var": 0.003749593098958333, "learning_rate": 0.0001, "loss": 1.3681, "loss/crossentropy": 2.64525043964386, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18839503824710846, "step": 13818 }, { "epoch": 0.2063476657284286, "grad_norm": 0.310546875, "grad_norm_var": 0.003579568862915039, "learning_rate": 0.0001, "loss": 1.3976, "loss/crossentropy": 2.67670738697052, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.20616412907838821, "step": 13819 }, { "epoch": 0.2063625978990436, "grad_norm": 0.306640625, "grad_norm_var": 0.003487078348795573, "learning_rate": 0.0001, "loss": 1.4405, "loss/crossentropy": 2.4594212770462036, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.194381944835186, "step": 13820 }, { "epoch": 0.2063775300696586, "grad_norm": 0.314453125, "grad_norm_var": 0.003392648696899414, "learning_rate": 0.0001, "loss": 1.3463, "loss/crossentropy": 2.5537678003311157, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1861693188548088, "step": 13821 }, { "epoch": 0.20639246224027355, "grad_norm": 0.306640625, "grad_norm_var": 0.0033791700998942057, "learning_rate": 0.0001, "loss": 1.513, "loss/crossentropy": 2.621810793876648, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.23958395421504974, "step": 13822 }, { "epoch": 0.20640739441088854, "grad_norm": 0.2490234375, "grad_norm_var": 0.0019742290178934732, "learning_rate": 0.0001, "loss": 1.2391, "loss/crossentropy": 2.4656083583831787, "loss/fcd": 1.078125, "loss/idx": 11.0, "loss/logits": 0.16096153110265732, "step": 13823 }, { "epoch": 0.20642232658150353, "grad_norm": 0.322265625, "grad_norm_var": 0.0019673625628153484, "learning_rate": 0.0001, "loss": 1.5353, "loss/crossentropy": 2.6665170192718506, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.25403837114572525, "step": 13824 }, { "epoch": 0.2064372587521185, "grad_norm": 0.32421875, "grad_norm_var": 0.001951150099436442, "learning_rate": 0.0001, "loss": 1.4818, "loss/crossentropy": 2.5007524490356445, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.20838695019483566, "step": 13825 }, { "epoch": 0.20645219092273348, "grad_norm": 0.330078125, "grad_norm_var": 0.0017383853594462077, "learning_rate": 0.0001, "loss": 1.4444, "loss/crossentropy": 2.860469698905945, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21002965420484543, "step": 13826 }, { "epoch": 0.20646712309334847, "grad_norm": 0.29296875, "grad_norm_var": 0.0017310738563537597, "learning_rate": 0.0001, "loss": 1.3566, "loss/crossentropy": 2.5192079544067383, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.18080420792102814, "step": 13827 }, { "epoch": 0.20648205526396346, "grad_norm": 0.37109375, "grad_norm_var": 0.0018458008766174317, "learning_rate": 0.0001, "loss": 1.5248, "loss/crossentropy": 2.916916847229004, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.21227120608091354, "step": 13828 }, { "epoch": 0.20649698743457842, "grad_norm": 0.2890625, "grad_norm_var": 0.0018390615781148274, "learning_rate": 0.0001, "loss": 1.3767, "loss/crossentropy": 2.8081746101379395, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.19696970283985138, "step": 13829 }, { "epoch": 0.2065119196051934, "grad_norm": 0.298828125, "grad_norm_var": 0.0018189072608947754, "learning_rate": 0.0001, "loss": 1.3914, "loss/crossentropy": 2.5884902477264404, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.192212276160717, "step": 13830 }, { "epoch": 0.2065268517758084, "grad_norm": 0.279296875, "grad_norm_var": 0.0018795609474182128, "learning_rate": 0.0001, "loss": 1.3879, "loss/crossentropy": 2.7160561084747314, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.20821189135313034, "step": 13831 }, { "epoch": 0.20654178394642336, "grad_norm": 0.3515625, "grad_norm_var": 0.000850832462310791, "learning_rate": 0.0001, "loss": 1.6565, "loss/crossentropy": 2.993056297302246, "loss/fcd": 1.3984375, "loss/idx": 11.0, "loss/logits": 0.25808387994766235, "step": 13832 }, { "epoch": 0.20655671611703835, "grad_norm": 0.306640625, "grad_norm_var": 0.0007936437924702962, "learning_rate": 0.0001, "loss": 1.4473, "loss/crossentropy": 2.7104631662368774, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20514635741710663, "step": 13833 }, { "epoch": 0.20657164828765334, "grad_norm": 0.34375, "grad_norm_var": 0.0008557915687561035, "learning_rate": 0.0001, "loss": 1.4769, "loss/crossentropy": 2.8102599382400513, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21126855164766312, "step": 13834 }, { "epoch": 0.2065865804582683, "grad_norm": 0.287109375, "grad_norm_var": 0.0008956551551818848, "learning_rate": 0.0001, "loss": 1.414, "loss/crossentropy": 2.270321488380432, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19913746416568756, "step": 13835 }, { "epoch": 0.2066015126288833, "grad_norm": 0.302734375, "grad_norm_var": 0.0008988022804260254, "learning_rate": 0.0001, "loss": 1.6396, "loss/crossentropy": 2.3559476137161255, "loss/fcd": 1.359375, "loss/idx": 11.0, "loss/logits": 0.2801807075738907, "step": 13836 }, { "epoch": 0.20661644479949828, "grad_norm": 0.265625, "grad_norm_var": 0.001022779941558838, "learning_rate": 0.0001, "loss": 1.3302, "loss/crossentropy": 2.447188377380371, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.1700674444437027, "step": 13837 }, { "epoch": 0.20663137697011327, "grad_norm": 0.251953125, "grad_norm_var": 0.0012163758277893067, "learning_rate": 0.0001, "loss": 1.276, "loss/crossentropy": 2.2483080625534058, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.15486137568950653, "step": 13838 }, { "epoch": 0.20664630914072823, "grad_norm": 0.265625, "grad_norm_var": 0.001111602783203125, "learning_rate": 0.0001, "loss": 1.2959, "loss/crossentropy": 2.5568283796310425, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1670057624578476, "step": 13839 }, { "epoch": 0.20666124131134322, "grad_norm": 0.3046875, "grad_norm_var": 0.001090860366821289, "learning_rate": 0.0001, "loss": 1.4144, "loss/crossentropy": 2.66115140914917, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19950693100690842, "step": 13840 }, { "epoch": 0.20667617348195821, "grad_norm": 0.28125, "grad_norm_var": 0.001090860366821289, "learning_rate": 0.0001, "loss": 1.2857, "loss/crossentropy": 2.6254615783691406, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.1762775033712387, "step": 13841 }, { "epoch": 0.20669110565257318, "grad_norm": 0.28125, "grad_norm_var": 0.0010531107584635417, "learning_rate": 0.0001, "loss": 1.3463, "loss/crossentropy": 2.5339382886886597, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.19782868772745132, "step": 13842 }, { "epoch": 0.20670603782318817, "grad_norm": 0.302734375, "grad_norm_var": 0.001052077611287435, "learning_rate": 0.0001, "loss": 1.441, "loss/crossentropy": 2.4241950511932373, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.1909923180937767, "step": 13843 }, { "epoch": 0.20672096999380316, "grad_norm": 0.296875, "grad_norm_var": 0.0006824334462483724, "learning_rate": 0.0001, "loss": 1.4374, "loss/crossentropy": 2.7168638706207275, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.2107982039451599, "step": 13844 }, { "epoch": 0.20673590216441812, "grad_norm": 0.390625, "grad_norm_var": 0.0012560367584228515, "learning_rate": 0.0001, "loss": 1.624, "loss/crossentropy": 2.4322097301483154, "loss/fcd": 1.3828125, "loss/idx": 11.0, "loss/logits": 0.241146981716156, "step": 13845 }, { "epoch": 0.2067508343350331, "grad_norm": 0.2734375, "grad_norm_var": 0.0013025283813476562, "learning_rate": 0.0001, "loss": 1.422, "loss/crossentropy": 2.5769253969192505, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.20329899340867996, "step": 13846 }, { "epoch": 0.2067657665056481, "grad_norm": 0.9765625, "grad_norm_var": 0.029850244522094727, "learning_rate": 0.0001, "loss": 1.9288, "loss/crossentropy": 2.480228304862976, "loss/fcd": 1.59765625, "loss/idx": 11.0, "loss/logits": 0.33112482726573944, "step": 13847 }, { "epoch": 0.20678069867626309, "grad_norm": 0.30859375, "grad_norm_var": 0.02991458574930827, "learning_rate": 0.0001, "loss": 1.2919, "loss/crossentropy": 2.7650054693222046, "loss/fcd": 1.12109375, "loss/idx": 11.0, "loss/logits": 0.17076723277568817, "step": 13848 }, { "epoch": 0.20679563084687805, "grad_norm": 0.3046875, "grad_norm_var": 0.029923502604166666, "learning_rate": 0.0001, "loss": 1.5276, "loss/crossentropy": 2.9218796491622925, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.2463454008102417, "step": 13849 }, { "epoch": 0.20681056301749304, "grad_norm": 0.3359375, "grad_norm_var": 0.029923248291015624, "learning_rate": 0.0001, "loss": 1.4298, "loss/crossentropy": 2.4775441884994507, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19545284658670425, "step": 13850 }, { "epoch": 0.20682549518810803, "grad_norm": 0.4140625, "grad_norm_var": 0.03004619280497233, "learning_rate": 0.0001, "loss": 1.5774, "loss/crossentropy": 3.044363498687744, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.23756028711795807, "step": 13851 }, { "epoch": 0.206840427358723, "grad_norm": 0.306640625, "grad_norm_var": 0.030023940404256187, "learning_rate": 0.0001, "loss": 1.4031, "loss/crossentropy": 2.7190335988998413, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19611599296331406, "step": 13852 }, { "epoch": 0.20685535952933798, "grad_norm": 0.314453125, "grad_norm_var": 0.029639689127604167, "learning_rate": 0.0001, "loss": 1.4047, "loss/crossentropy": 2.3652888536453247, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19378089904785156, "step": 13853 }, { "epoch": 0.20687029169995297, "grad_norm": 0.302734375, "grad_norm_var": 0.029133033752441407, "learning_rate": 0.0001, "loss": 1.366, "loss/crossentropy": 2.456057071685791, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.1784963756799698, "step": 13854 }, { "epoch": 0.20688522387056796, "grad_norm": 0.28515625, "grad_norm_var": 0.028927357991536458, "learning_rate": 0.0001, "loss": 1.3587, "loss/crossentropy": 2.625131368637085, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.182881198823452, "step": 13855 }, { "epoch": 0.20690015604118292, "grad_norm": 0.310546875, "grad_norm_var": 0.028890212376912434, "learning_rate": 0.0001, "loss": 1.4581, "loss/crossentropy": 2.4621479511260986, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.21593015640974045, "step": 13856 }, { "epoch": 0.2069150882117979, "grad_norm": 0.3515625, "grad_norm_var": 0.02850454648335775, "learning_rate": 0.0001, "loss": 1.5493, "loss/crossentropy": 2.5032119750976562, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.24855617433786392, "step": 13857 }, { "epoch": 0.2069300203824129, "grad_norm": 0.34765625, "grad_norm_var": 0.02808518409729004, "learning_rate": 0.0001, "loss": 1.5585, "loss/crossentropy": 2.497425079345703, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.21471552550792694, "step": 13858 }, { "epoch": 0.20694495255302786, "grad_norm": 0.298828125, "grad_norm_var": 0.028117990493774413, "learning_rate": 0.0001, "loss": 1.3729, "loss/crossentropy": 2.777335524559021, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.18150099366903305, "step": 13859 }, { "epoch": 0.20695988472364285, "grad_norm": 0.3203125, "grad_norm_var": 0.02794365882873535, "learning_rate": 0.0001, "loss": 1.5256, "loss/crossentropy": 2.514986991882324, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.2170066237449646, "step": 13860 }, { "epoch": 0.20697481689425784, "grad_norm": 0.310546875, "grad_norm_var": 0.028072039286295574, "learning_rate": 0.0001, "loss": 1.4755, "loss/crossentropy": 2.6647058725357056, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.21768490225076675, "step": 13861 }, { "epoch": 0.2069897490648728, "grad_norm": 0.302734375, "grad_norm_var": 0.027787129084269207, "learning_rate": 0.0001, "loss": 1.4344, "loss/crossentropy": 2.3869296312332153, "loss/fcd": 1.25390625, "loss/idx": 11.0, "loss/logits": 0.18045629560947418, "step": 13862 }, { "epoch": 0.2070046812354878, "grad_norm": 0.31640625, "grad_norm_var": 0.0009253025054931641, "learning_rate": 0.0001, "loss": 1.5085, "loss/crossentropy": 2.464542508125305, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.2311454862356186, "step": 13863 }, { "epoch": 0.20701961340610278, "grad_norm": 0.31640625, "grad_norm_var": 0.0009165287017822265, "learning_rate": 0.0001, "loss": 1.3936, "loss/crossentropy": 2.597678542137146, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.20613113790750504, "step": 13864 }, { "epoch": 0.20703454557671777, "grad_norm": 0.337890625, "grad_norm_var": 0.0009124755859375, "learning_rate": 0.0001, "loss": 1.6506, "loss/crossentropy": 2.317377984523773, "loss/fcd": 1.40234375, "loss/idx": 11.0, "loss/logits": 0.24822499603033066, "step": 13865 }, { "epoch": 0.20704947774733273, "grad_norm": 0.478515625, "grad_norm_var": 0.0024243513743082683, "learning_rate": 0.0001, "loss": 1.5238, "loss/crossentropy": 2.5122352838516235, "loss/fcd": 1.33203125, "loss/idx": 11.0, "loss/logits": 0.1917285919189453, "step": 13866 }, { "epoch": 0.20706440991794772, "grad_norm": 0.30859375, "grad_norm_var": 0.0019677321116129557, "learning_rate": 0.0001, "loss": 1.3086, "loss/crossentropy": 2.746204972267151, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.17188720405101776, "step": 13867 }, { "epoch": 0.2070793420885627, "grad_norm": 0.271484375, "grad_norm_var": 0.002133671442667643, "learning_rate": 0.0001, "loss": 1.349, "loss/crossentropy": 2.655271291732788, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.19277872145175934, "step": 13868 }, { "epoch": 0.20709427425917767, "grad_norm": 0.28125, "grad_norm_var": 0.0022420247395833334, "learning_rate": 0.0001, "loss": 1.3914, "loss/crossentropy": 2.5114349126815796, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1961049810051918, "step": 13869 }, { "epoch": 0.20710920642979266, "grad_norm": 0.30078125, "grad_norm_var": 0.0022470951080322266, "learning_rate": 0.0001, "loss": 1.5027, "loss/crossentropy": 2.422927141189575, "loss/fcd": 1.2890625, "loss/idx": 11.0, "loss/logits": 0.21367865800857544, "step": 13870 }, { "epoch": 0.20712413860040765, "grad_norm": 0.314453125, "grad_norm_var": 0.0021600723266601562, "learning_rate": 0.0001, "loss": 1.4022, "loss/crossentropy": 2.474276542663574, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.17953462898731232, "step": 13871 }, { "epoch": 0.20713907077102264, "grad_norm": 0.328125, "grad_norm_var": 0.0021502017974853516, "learning_rate": 0.0001, "loss": 1.4497, "loss/crossentropy": 2.4703822135925293, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2036399468779564, "step": 13872 }, { "epoch": 0.2071540029416376, "grad_norm": 0.40234375, "grad_norm_var": 0.0024973392486572266, "learning_rate": 0.0001, "loss": 1.6666, "loss/crossentropy": 2.3338793516159058, "loss/fcd": 1.4453125, "loss/idx": 11.0, "loss/logits": 0.2213374376296997, "step": 13873 }, { "epoch": 0.2071689351122526, "grad_norm": 0.408203125, "grad_norm_var": 0.0028910319010416667, "learning_rate": 0.0001, "loss": 1.4373, "loss/crossentropy": 2.615304946899414, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.21077965945005417, "step": 13874 }, { "epoch": 0.20718386728286758, "grad_norm": 0.283203125, "grad_norm_var": 0.0029734293619791668, "learning_rate": 0.0001, "loss": 1.3435, "loss/crossentropy": 2.6956586837768555, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.17162546515464783, "step": 13875 }, { "epoch": 0.20719879945348255, "grad_norm": 0.337890625, "grad_norm_var": 0.0029698530832926433, "learning_rate": 0.0001, "loss": 1.3458, "loss/crossentropy": 2.511268734931946, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.17780324816703796, "step": 13876 }, { "epoch": 0.20721373162409754, "grad_norm": 0.353515625, "grad_norm_var": 0.002967055638631185, "learning_rate": 0.0001, "loss": 1.6908, "loss/crossentropy": 2.3102599382400513, "loss/fcd": 1.4453125, "loss/idx": 11.0, "loss/logits": 0.24552126973867416, "step": 13877 }, { "epoch": 0.20722866379471253, "grad_norm": 0.30078125, "grad_norm_var": 0.0029754002888997396, "learning_rate": 0.0001, "loss": 1.3207, "loss/crossentropy": 2.6181275844573975, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.18003857880830765, "step": 13878 }, { "epoch": 0.2072435959653275, "grad_norm": 0.30859375, "grad_norm_var": 0.002997271219889323, "learning_rate": 0.0001, "loss": 1.347, "loss/crossentropy": 2.545871138572693, "loss/fcd": 1.1640625, "loss/idx": 11.0, "loss/logits": 0.18292221426963806, "step": 13879 }, { "epoch": 0.20725852813594248, "grad_norm": 0.318359375, "grad_norm_var": 0.002993122736612956, "learning_rate": 0.0001, "loss": 1.5184, "loss/crossentropy": 2.6112974882125854, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.24500729143619537, "step": 13880 }, { "epoch": 0.20727346030655747, "grad_norm": 0.26171875, "grad_norm_var": 0.0033098856608072915, "learning_rate": 0.0001, "loss": 1.4013, "loss/crossentropy": 2.553151249885559, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20213107764720917, "step": 13881 }, { "epoch": 0.20728839247717246, "grad_norm": 0.29296875, "grad_norm_var": 0.001753091812133789, "learning_rate": 0.0001, "loss": 1.4276, "loss/crossentropy": 2.336093783378601, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.19319364428520203, "step": 13882 }, { "epoch": 0.20730332464778742, "grad_norm": 0.30859375, "grad_norm_var": 0.001753091812133789, "learning_rate": 0.0001, "loss": 1.3486, "loss/crossentropy": 2.6098453998565674, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.16893861442804337, "step": 13883 }, { "epoch": 0.2073182568184024, "grad_norm": 0.275390625, "grad_norm_var": 0.00173033078511556, "learning_rate": 0.0001, "loss": 1.3251, "loss/crossentropy": 2.410228967666626, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1766752153635025, "step": 13884 }, { "epoch": 0.2073331889890174, "grad_norm": 0.271484375, "grad_norm_var": 0.0017831802368164062, "learning_rate": 0.0001, "loss": 1.4539, "loss/crossentropy": 2.543544292449951, "loss/fcd": 1.234375, "loss/idx": 11.0, "loss/logits": 0.21949228644371033, "step": 13885 }, { "epoch": 0.20734812115963236, "grad_norm": 0.32421875, "grad_norm_var": 0.0017679214477539062, "learning_rate": 0.0001, "loss": 1.4827, "loss/crossentropy": 2.561304450035095, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.20536305010318756, "step": 13886 }, { "epoch": 0.20736305333024735, "grad_norm": 0.3515625, "grad_norm_var": 0.0018358707427978515, "learning_rate": 0.0001, "loss": 1.275, "loss/crossentropy": 2.9456288814544678, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.1656743735074997, "step": 13887 }, { "epoch": 0.20737798550086234, "grad_norm": 0.306640625, "grad_norm_var": 0.0018426895141601563, "learning_rate": 0.0001, "loss": 1.4151, "loss/crossentropy": 2.524316668510437, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.1963498592376709, "step": 13888 }, { "epoch": 0.20739291767147733, "grad_norm": 0.29296875, "grad_norm_var": 0.0013762791951497397, "learning_rate": 0.0001, "loss": 1.3143, "loss/crossentropy": 2.4252926111221313, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.1737166941165924, "step": 13889 }, { "epoch": 0.2074078498420923, "grad_norm": 0.318359375, "grad_norm_var": 0.0007314046223958333, "learning_rate": 0.0001, "loss": 1.4158, "loss/crossentropy": 2.2194761633872986, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.17752201110124588, "step": 13890 }, { "epoch": 0.20742278201270728, "grad_norm": 0.33203125, "grad_norm_var": 0.0007278283437093098, "learning_rate": 0.0001, "loss": 1.5767, "loss/crossentropy": 2.336857795715332, "loss/fcd": 1.33984375, "loss/idx": 11.0, "loss/logits": 0.23685450106859207, "step": 13891 }, { "epoch": 0.20743771418332227, "grad_norm": 0.255859375, "grad_norm_var": 0.0008399804433186849, "learning_rate": 0.0001, "loss": 1.2654, "loss/crossentropy": 2.4705175161361694, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.15211894363164902, "step": 13892 }, { "epoch": 0.20745264635393723, "grad_norm": 0.3984375, "grad_norm_var": 0.001259295145670573, "learning_rate": 0.0001, "loss": 1.6626, "loss/crossentropy": 2.6157805919647217, "loss/fcd": 1.42578125, "loss/idx": 11.0, "loss/logits": 0.2368294820189476, "step": 13893 }, { "epoch": 0.20746757852455222, "grad_norm": 0.322265625, "grad_norm_var": 0.0012692610422770183, "learning_rate": 0.0001, "loss": 1.4492, "loss/crossentropy": 2.8168740272521973, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.19919200986623764, "step": 13894 }, { "epoch": 0.2074825106951672, "grad_norm": 0.29296875, "grad_norm_var": 0.00128477414449056, "learning_rate": 0.0001, "loss": 1.3246, "loss/crossentropy": 2.576037883758545, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1683856099843979, "step": 13895 }, { "epoch": 0.20749744286578217, "grad_norm": 0.330078125, "grad_norm_var": 0.0013099511464436848, "learning_rate": 0.0001, "loss": 1.385, "loss/crossentropy": 2.540294051170349, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1897255778312683, "step": 13896 }, { "epoch": 0.20751237503639716, "grad_norm": 0.3125, "grad_norm_var": 0.0011545658111572266, "learning_rate": 0.0001, "loss": 1.5152, "loss/crossentropy": 2.5531375408172607, "loss/fcd": 1.30078125, "loss/idx": 11.0, "loss/logits": 0.21442456543445587, "step": 13897 }, { "epoch": 0.20752730720701215, "grad_norm": 0.28125, "grad_norm_var": 0.001192331314086914, "learning_rate": 0.0001, "loss": 1.3818, "loss/crossentropy": 2.764095664024353, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19434629380702972, "step": 13898 }, { "epoch": 0.20754223937762714, "grad_norm": 0.322265625, "grad_norm_var": 0.0011997858683268229, "learning_rate": 0.0001, "loss": 1.4072, "loss/crossentropy": 2.395478367805481, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.18450797349214554, "step": 13899 }, { "epoch": 0.2075571715482421, "grad_norm": 0.3046875, "grad_norm_var": 0.0011113325754801432, "learning_rate": 0.0001, "loss": 1.4229, "loss/crossentropy": 2.8399860858917236, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.20809800922870636, "step": 13900 }, { "epoch": 0.2075721037188571, "grad_norm": 0.28125, "grad_norm_var": 0.0010624567667643229, "learning_rate": 0.0001, "loss": 1.3492, "loss/crossentropy": 2.7473371028900146, "loss/fcd": 1.17578125, "loss/idx": 11.0, "loss/logits": 0.17344088107347488, "step": 13901 }, { "epoch": 0.20758703588947208, "grad_norm": 0.306640625, "grad_norm_var": 0.0010583082834879558, "learning_rate": 0.0001, "loss": 1.4852, "loss/crossentropy": 2.5268962383270264, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.2039022594690323, "step": 13902 }, { "epoch": 0.20760196806008704, "grad_norm": 0.27734375, "grad_norm_var": 0.0010220686594645183, "learning_rate": 0.0001, "loss": 1.2994, "loss/crossentropy": 2.711200475692749, "loss/fcd": 1.11328125, "loss/idx": 11.0, "loss/logits": 0.18613114953041077, "step": 13903 }, { "epoch": 0.20761690023070203, "grad_norm": 0.33203125, "grad_norm_var": 0.0010561625162760417, "learning_rate": 0.0001, "loss": 1.4885, "loss/crossentropy": 2.8980140686035156, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.218988835811615, "step": 13904 }, { "epoch": 0.20763183240131702, "grad_norm": 0.365234375, "grad_norm_var": 0.0012178897857666015, "learning_rate": 0.0001, "loss": 1.4723, "loss/crossentropy": 2.702457904815674, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.19495484232902527, "step": 13905 }, { "epoch": 0.20764676457193199, "grad_norm": 0.30078125, "grad_norm_var": 0.00122833251953125, "learning_rate": 0.0001, "loss": 1.4455, "loss/crossentropy": 2.5674792528152466, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.20335353165864944, "step": 13906 }, { "epoch": 0.20766169674254698, "grad_norm": 0.263671875, "grad_norm_var": 0.0013512770334879558, "learning_rate": 0.0001, "loss": 1.3571, "loss/crossentropy": 2.5254944562911987, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.19696763157844543, "step": 13907 }, { "epoch": 0.20767662891316196, "grad_norm": 0.337890625, "grad_norm_var": 0.0011883894602457681, "learning_rate": 0.0001, "loss": 1.3932, "loss/crossentropy": 2.6345207691192627, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.20180748403072357, "step": 13908 }, { "epoch": 0.20769156108377695, "grad_norm": 0.287109375, "grad_norm_var": 0.0007145563761393229, "learning_rate": 0.0001, "loss": 1.5512, "loss/crossentropy": 2.7844330072402954, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.246480330824852, "step": 13909 }, { "epoch": 0.20770649325439192, "grad_norm": 0.30078125, "grad_norm_var": 0.0007007439931233724, "learning_rate": 0.0001, "loss": 1.4378, "loss/crossentropy": 2.7082360982894897, "loss/fcd": 1.22265625, "loss/idx": 11.0, "loss/logits": 0.2151520997285843, "step": 13910 }, { "epoch": 0.2077214254250069, "grad_norm": 0.31640625, "grad_norm_var": 0.0006942590077718099, "learning_rate": 0.0001, "loss": 1.5566, "loss/crossentropy": 2.3909709453582764, "loss/fcd": 1.328125, "loss/idx": 11.0, "loss/logits": 0.22847380489110947, "step": 13911 }, { "epoch": 0.2077363575956219, "grad_norm": 0.29296875, "grad_norm_var": 0.0006685892740885417, "learning_rate": 0.0001, "loss": 1.4103, "loss/crossentropy": 2.521252989768982, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.1954377293586731, "step": 13912 }, { "epoch": 0.20775128976623686, "grad_norm": 0.279296875, "grad_norm_var": 0.0007050673166910808, "learning_rate": 0.0001, "loss": 1.3979, "loss/crossentropy": 2.4671413898468018, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19085149466991425, "step": 13913 }, { "epoch": 0.20776622193685185, "grad_norm": 0.296875, "grad_norm_var": 0.0006748040517171224, "learning_rate": 0.0001, "loss": 1.3263, "loss/crossentropy": 2.6037209033966064, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1778687760233879, "step": 13914 }, { "epoch": 0.20778115410746684, "grad_norm": 0.3359375, "grad_norm_var": 0.0007196426391601563, "learning_rate": 0.0001, "loss": 1.4529, "loss/crossentropy": 2.653137683868408, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.214620940387249, "step": 13915 }, { "epoch": 0.20779608627808183, "grad_norm": 0.291015625, "grad_norm_var": 0.0007317701975504558, "learning_rate": 0.0001, "loss": 1.4064, "loss/crossentropy": 2.638505458831787, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.20722603052854538, "step": 13916 }, { "epoch": 0.2078110184486968, "grad_norm": 0.279296875, "grad_norm_var": 0.0007379531860351562, "learning_rate": 0.0001, "loss": 1.3479, "loss/crossentropy": 2.757821798324585, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.19941389560699463, "step": 13917 }, { "epoch": 0.20782595061931178, "grad_norm": 0.27734375, "grad_norm_var": 0.000781106948852539, "learning_rate": 0.0001, "loss": 1.3119, "loss/crossentropy": 2.6135069131851196, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.17515219748020172, "step": 13918 }, { "epoch": 0.20784088278992677, "grad_norm": 0.298828125, "grad_norm_var": 0.0007389704386393229, "learning_rate": 0.0001, "loss": 1.3959, "loss/crossentropy": 2.51410710811615, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.19666799157857895, "step": 13919 }, { "epoch": 0.20785581496054173, "grad_norm": 0.291015625, "grad_norm_var": 0.0006879011789957683, "learning_rate": 0.0001, "loss": 1.473, "loss/crossentropy": 2.7266396284103394, "loss/fcd": 1.24609375, "loss/idx": 11.0, "loss/logits": 0.2269364520907402, "step": 13920 }, { "epoch": 0.20787074713115672, "grad_norm": 0.271484375, "grad_norm_var": 0.00043307940165201824, "learning_rate": 0.0001, "loss": 1.3771, "loss/crossentropy": 2.476930260658264, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.2052198126912117, "step": 13921 }, { "epoch": 0.2078856793017717, "grad_norm": 0.34375, "grad_norm_var": 0.0005813439687093099, "learning_rate": 0.0001, "loss": 1.6301, "loss/crossentropy": 2.59790575504303, "loss/fcd": 1.37890625, "loss/idx": 11.0, "loss/logits": 0.25123079121112823, "step": 13922 }, { "epoch": 0.20790061147238667, "grad_norm": 0.341796875, "grad_norm_var": 0.0006080468495686849, "learning_rate": 0.0001, "loss": 1.4589, "loss/crossentropy": 2.4957157373428345, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.22066517919301987, "step": 13923 }, { "epoch": 0.20791554364300166, "grad_norm": 0.294921875, "grad_norm_var": 0.0005213260650634765, "learning_rate": 0.0001, "loss": 1.3357, "loss/crossentropy": 2.4794511795043945, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17557836323976517, "step": 13924 }, { "epoch": 0.20793047581361665, "grad_norm": 0.3046875, "grad_norm_var": 0.0005105972290039062, "learning_rate": 0.0001, "loss": 1.3384, "loss/crossentropy": 2.7241528034210205, "loss/fcd": 1.16015625, "loss/idx": 11.0, "loss/logits": 0.17820703238248825, "step": 13925 }, { "epoch": 0.20794540798423164, "grad_norm": 0.28125, "grad_norm_var": 0.0005350748697916667, "learning_rate": 0.0001, "loss": 1.2652, "loss/crossentropy": 2.7146854400634766, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.16758227348327637, "step": 13926 }, { "epoch": 0.2079603401548466, "grad_norm": 0.2890625, "grad_norm_var": 0.0005212783813476563, "learning_rate": 0.0001, "loss": 1.4006, "loss/crossentropy": 2.763953447341919, "loss/fcd": 1.20703125, "loss/idx": 11.0, "loss/logits": 0.19353967159986496, "step": 13927 }, { "epoch": 0.2079752723254616, "grad_norm": 0.27734375, "grad_norm_var": 0.0005472183227539062, "learning_rate": 0.0001, "loss": 1.3065, "loss/crossentropy": 2.8591142892837524, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.16975657641887665, "step": 13928 }, { "epoch": 0.20799020449607658, "grad_norm": 0.30859375, "grad_norm_var": 0.0005312442779541015, "learning_rate": 0.0001, "loss": 1.414, "loss/crossentropy": 2.750168204307556, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.1992030367255211, "step": 13929 }, { "epoch": 0.20800513666669154, "grad_norm": 0.271484375, "grad_norm_var": 0.0005785624186197916, "learning_rate": 0.0001, "loss": 1.2646, "loss/crossentropy": 2.5619980096817017, "loss/fcd": 1.09765625, "loss/idx": 11.0, "loss/logits": 0.1668967381119728, "step": 13930 }, { "epoch": 0.20802006883730653, "grad_norm": 0.326171875, "grad_norm_var": 0.0005342960357666016, "learning_rate": 0.0001, "loss": 1.3798, "loss/crossentropy": 2.7260074615478516, "loss/fcd": 1.1875, "loss/idx": 11.0, "loss/logits": 0.19233888387680054, "step": 13931 }, { "epoch": 0.20803500100792152, "grad_norm": 0.291015625, "grad_norm_var": 0.0005342960357666016, "learning_rate": 0.0001, "loss": 1.4893, "loss/crossentropy": 2.5547302961349487, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.22757943719625473, "step": 13932 }, { "epoch": 0.2080499331785365, "grad_norm": 0.29296875, "grad_norm_var": 0.0005141576131184896, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.5887277126312256, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19947806000709534, "step": 13933 }, { "epoch": 0.20806486534915147, "grad_norm": 0.3203125, "grad_norm_var": 0.000513458251953125, "learning_rate": 0.0001, "loss": 1.5026, "loss/crossentropy": 2.52135968208313, "loss/fcd": 1.28125, "loss/idx": 11.0, "loss/logits": 0.22133152931928635, "step": 13934 }, { "epoch": 0.20807979751976646, "grad_norm": 0.369140625, "grad_norm_var": 0.0008087158203125, "learning_rate": 0.0001, "loss": 1.6287, "loss/crossentropy": 2.7430185079574585, "loss/fcd": 1.38671875, "loss/idx": 11.0, "loss/logits": 0.24195396900177002, "step": 13935 }, { "epoch": 0.20809472969038145, "grad_norm": 0.341796875, "grad_norm_var": 0.0008773167928059895, "learning_rate": 0.0001, "loss": 1.5096, "loss/crossentropy": 2.5794293880462646, "loss/fcd": 1.296875, "loss/idx": 11.0, "loss/logits": 0.21267899125814438, "step": 13936 }, { "epoch": 0.20810966186099641, "grad_norm": 0.359375, "grad_norm_var": 0.0009338219960530598, "learning_rate": 0.0001, "loss": 1.5301, "loss/crossentropy": 2.689708709716797, "loss/fcd": 1.3125, "loss/idx": 11.0, "loss/logits": 0.21757178008556366, "step": 13937 }, { "epoch": 0.2081245940316114, "grad_norm": 0.337890625, "grad_norm_var": 0.0009122212727864583, "learning_rate": 0.0001, "loss": 1.564, "loss/crossentropy": 2.6403188705444336, "loss/fcd": 1.3046875, "loss/idx": 11.0, "loss/logits": 0.2593136876821518, "step": 13938 }, { "epoch": 0.2081395262022264, "grad_norm": 0.31640625, "grad_norm_var": 0.0008549849192301432, "learning_rate": 0.0001, "loss": 1.4688, "loss/crossentropy": 2.247285008430481, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.19539380073547363, "step": 13939 }, { "epoch": 0.20815445837284136, "grad_norm": 0.330078125, "grad_norm_var": 0.0008549849192301432, "learning_rate": 0.0001, "loss": 1.5482, "loss/crossentropy": 2.3941701650619507, "loss/fcd": 1.34375, "loss/idx": 11.0, "loss/logits": 0.20448487251996994, "step": 13940 }, { "epoch": 0.20816939054345635, "grad_norm": 0.326171875, "grad_norm_var": 0.000858306884765625, "learning_rate": 0.0001, "loss": 1.4572, "loss/crossentropy": 2.7668012380599976, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21892518550157547, "step": 13941 }, { "epoch": 0.20818432271407133, "grad_norm": 0.275390625, "grad_norm_var": 0.0008867740631103516, "learning_rate": 0.0001, "loss": 1.4101, "loss/crossentropy": 2.5989872217178345, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.19520960748195648, "step": 13942 }, { "epoch": 0.20819925488468632, "grad_norm": 0.337890625, "grad_norm_var": 0.0008696873982747395, "learning_rate": 0.0001, "loss": 1.316, "loss/crossentropy": 2.5438849925994873, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.16362987458705902, "step": 13943 }, { "epoch": 0.2082141870553013, "grad_norm": 0.330078125, "grad_norm_var": 0.0007602532704671223, "learning_rate": 0.0001, "loss": 1.3928, "loss/crossentropy": 2.678866386413574, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.20138388127088547, "step": 13944 }, { "epoch": 0.20822911922591628, "grad_norm": 0.271484375, "grad_norm_var": 0.0009073257446289062, "learning_rate": 0.0001, "loss": 1.3259, "loss/crossentropy": 2.5707035064697266, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17357101291418076, "step": 13945 }, { "epoch": 0.20824405139653127, "grad_norm": 0.333984375, "grad_norm_var": 0.0007588068644205729, "learning_rate": 0.0001, "loss": 1.4973, "loss/crossentropy": 2.729626417160034, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.22387037426233292, "step": 13946 }, { "epoch": 0.20825898356714623, "grad_norm": 0.328125, "grad_norm_var": 0.0007599989573160807, "learning_rate": 0.0001, "loss": 1.5309, "loss/crossentropy": 2.2318042516708374, "loss/fcd": 1.31640625, "loss/idx": 11.0, "loss/logits": 0.21454021334648132, "step": 13947 }, { "epoch": 0.20827391573776122, "grad_norm": 0.296875, "grad_norm_var": 0.0007374445597330729, "learning_rate": 0.0001, "loss": 1.3276, "loss/crossentropy": 2.7099530696868896, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.17137396335601807, "step": 13948 }, { "epoch": 0.2082888479083762, "grad_norm": 0.294921875, "grad_norm_var": 0.0007298628489176433, "learning_rate": 0.0001, "loss": 1.5098, "loss/crossentropy": 2.588147282600403, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.23244262486696243, "step": 13949 }, { "epoch": 0.2083037800789912, "grad_norm": 0.27734375, "grad_norm_var": 0.0008613427480061848, "learning_rate": 0.0001, "loss": 1.3488, "loss/crossentropy": 2.640983462333679, "loss/fcd": 1.16796875, "loss/idx": 11.0, "loss/logits": 0.18084166198968887, "step": 13950 }, { "epoch": 0.20831871224960616, "grad_norm": 0.306640625, "grad_norm_var": 0.0006995995839436849, "learning_rate": 0.0001, "loss": 1.342, "loss/crossentropy": 2.7015082836151123, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.1701069176197052, "step": 13951 }, { "epoch": 0.20833364442022115, "grad_norm": 0.26171875, "grad_norm_var": 0.0008305867513020833, "learning_rate": 0.0001, "loss": 1.2864, "loss/crossentropy": 2.4549888372421265, "loss/fcd": 1.109375, "loss/idx": 11.0, "loss/logits": 0.17703990638256073, "step": 13952 }, { "epoch": 0.20834857659083614, "grad_norm": 0.458984375, "grad_norm_var": 0.0020862420399983725, "learning_rate": 0.0001, "loss": 1.9388, "loss/crossentropy": 2.636969208717346, "loss/fcd": 1.5546875, "loss/idx": 11.0, "loss/logits": 0.38409335911273956, "step": 13953 }, { "epoch": 0.2083635087614511, "grad_norm": 0.30078125, "grad_norm_var": 0.0020726521809895835, "learning_rate": 0.0001, "loss": 1.4543, "loss/crossentropy": 2.527142882347107, "loss/fcd": 1.25, "loss/idx": 11.0, "loss/logits": 0.20429198443889618, "step": 13954 }, { "epoch": 0.2083784409320661, "grad_norm": 0.263671875, "grad_norm_var": 0.0022395928700764973, "learning_rate": 0.0001, "loss": 1.4084, "loss/crossentropy": 2.4064581394195557, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19744478166103363, "step": 13955 }, { "epoch": 0.20839337310268108, "grad_norm": 0.310546875, "grad_norm_var": 0.0022167046864827473, "learning_rate": 0.0001, "loss": 1.3003, "loss/crossentropy": 2.675121784210205, "loss/fcd": 1.1328125, "loss/idx": 11.0, "loss/logits": 0.16745173931121826, "step": 13956 }, { "epoch": 0.20840830527329604, "grad_norm": 0.3359375, "grad_norm_var": 0.0022425333658854167, "learning_rate": 0.0001, "loss": 1.4065, "loss/crossentropy": 2.6574106216430664, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.19556811451911926, "step": 13957 }, { "epoch": 0.20842323744391103, "grad_norm": 0.28515625, "grad_norm_var": 0.0022014458974202473, "learning_rate": 0.0001, "loss": 1.4319, "loss/crossentropy": 2.2855417728424072, "loss/fcd": 1.21875, "loss/idx": 11.0, "loss/logits": 0.21310727298259735, "step": 13958 }, { "epoch": 0.20843816961452602, "grad_norm": 0.2890625, "grad_norm_var": 0.002182769775390625, "learning_rate": 0.0001, "loss": 1.4318, "loss/crossentropy": 2.5471078157424927, "loss/fcd": 1.23046875, "loss/idx": 11.0, "loss/logits": 0.20128624886274338, "step": 13959 }, { "epoch": 0.208453101785141, "grad_norm": 0.310546875, "grad_norm_var": 0.002151934305826823, "learning_rate": 0.0001, "loss": 1.3835, "loss/crossentropy": 2.802828550338745, "loss/fcd": 1.18359375, "loss/idx": 11.0, "loss/logits": 0.19993221759796143, "step": 13960 }, { "epoch": 0.20846803395575597, "grad_norm": 0.29296875, "grad_norm_var": 0.002076578140258789, "learning_rate": 0.0001, "loss": 1.3592, "loss/crossentropy": 2.6505292654037476, "loss/fcd": 1.171875, "loss/idx": 11.0, "loss/logits": 0.18733200430870056, "step": 13961 }, { "epoch": 0.20848296612637096, "grad_norm": 0.314453125, "grad_norm_var": 0.0020358880360921224, "learning_rate": 0.0001, "loss": 1.4501, "loss/crossentropy": 2.7529841661453247, "loss/fcd": 1.23828125, "loss/idx": 11.0, "loss/logits": 0.21185874938964844, "step": 13962 }, { "epoch": 0.20849789829698595, "grad_norm": 0.3125, "grad_norm_var": 0.0020091851552327474, "learning_rate": 0.0001, "loss": 1.5035, "loss/crossentropy": 2.690936803817749, "loss/fcd": 1.2734375, "loss/idx": 11.0, "loss/logits": 0.23003316670656204, "step": 13963 }, { "epoch": 0.2085128304676009, "grad_norm": 0.29296875, "grad_norm_var": 0.002015415827433268, "learning_rate": 0.0001, "loss": 1.323, "loss/crossentropy": 2.5651973485946655, "loss/fcd": 1.15234375, "loss/idx": 11.0, "loss/logits": 0.17063120752573013, "step": 13964 }, { "epoch": 0.2085277626382159, "grad_norm": 0.265625, "grad_norm_var": 0.002115313212076823, "learning_rate": 0.0001, "loss": 1.3821, "loss/crossentropy": 2.507699966430664, "loss/fcd": 1.19921875, "loss/idx": 11.0, "loss/logits": 0.18287690728902817, "step": 13965 }, { "epoch": 0.2085426948088309, "grad_norm": 0.306640625, "grad_norm_var": 0.002061192194620768, "learning_rate": 0.0001, "loss": 1.4829, "loss/crossentropy": 2.7584887742996216, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.21728814393281937, "step": 13966 }, { "epoch": 0.20855762697944585, "grad_norm": 0.2734375, "grad_norm_var": 0.0021306355794270832, "learning_rate": 0.0001, "loss": 1.3205, "loss/crossentropy": 2.5302960872650146, "loss/fcd": 1.13671875, "loss/idx": 11.0, "loss/logits": 0.18375377357006073, "step": 13967 }, { "epoch": 0.20857255915006084, "grad_norm": 0.322265625, "grad_norm_var": 0.0020128726959228516, "learning_rate": 0.0001, "loss": 1.3993, "loss/crossentropy": 2.7300697565078735, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.2039625197649002, "step": 13968 }, { "epoch": 0.20858749132067583, "grad_norm": 0.259765625, "grad_norm_var": 0.0004953861236572266, "learning_rate": 0.0001, "loss": 1.3415, "loss/crossentropy": 2.725586175918579, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.19305647909641266, "step": 13969 }, { "epoch": 0.20860242349129082, "grad_norm": 0.306640625, "grad_norm_var": 0.000501251220703125, "learning_rate": 0.0001, "loss": 1.4596, "loss/crossentropy": 2.6912288665771484, "loss/fcd": 1.2578125, "loss/idx": 11.0, "loss/logits": 0.20177149027585983, "step": 13970 }, { "epoch": 0.20861735566190578, "grad_norm": 0.294921875, "grad_norm_var": 0.0004259745279947917, "learning_rate": 0.0001, "loss": 1.4203, "loss/crossentropy": 2.797803521156311, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20933927595615387, "step": 13971 }, { "epoch": 0.20863228783252077, "grad_norm": 0.28125, "grad_norm_var": 0.00043193499247233075, "learning_rate": 0.0001, "loss": 1.278, "loss/crossentropy": 2.3040377497673035, "loss/fcd": 1.125, "loss/idx": 11.0, "loss/logits": 0.15295425057411194, "step": 13972 }, { "epoch": 0.20864722000313576, "grad_norm": 0.322265625, "grad_norm_var": 0.00037174224853515626, "learning_rate": 0.0001, "loss": 1.5007, "loss/crossentropy": 2.5778888463974, "loss/fcd": 1.26953125, "loss/idx": 11.0, "loss/logits": 0.23118796199560165, "step": 13973 }, { "epoch": 0.20866215217375073, "grad_norm": 0.27734375, "grad_norm_var": 0.0003864924112955729, "learning_rate": 0.0001, "loss": 1.415, "loss/crossentropy": 2.384514808654785, "loss/fcd": 1.2109375, "loss/idx": 11.0, "loss/logits": 0.20407728850841522, "step": 13974 }, { "epoch": 0.20867708434436572, "grad_norm": 0.369140625, "grad_norm_var": 0.0007221062978108723, "learning_rate": 0.0001, "loss": 1.3053, "loss/crossentropy": 2.4991849660873413, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1764342114329338, "step": 13975 }, { "epoch": 0.2086920165149807, "grad_norm": 0.28515625, "grad_norm_var": 0.0007272720336914063, "learning_rate": 0.0001, "loss": 1.3024, "loss/crossentropy": 2.6412293910980225, "loss/fcd": 1.12890625, "loss/idx": 11.0, "loss/logits": 0.1734946072101593, "step": 13976 }, { "epoch": 0.2087069486855957, "grad_norm": 0.3125, "grad_norm_var": 0.0007364908854166667, "learning_rate": 0.0001, "loss": 1.433, "loss/crossentropy": 2.693666934967041, "loss/fcd": 1.2265625, "loss/idx": 11.0, "loss/logits": 0.20647503435611725, "step": 13977 }, { "epoch": 0.20872188085621066, "grad_norm": 0.265625, "grad_norm_var": 0.0007901350657145182, "learning_rate": 0.0001, "loss": 1.2197, "loss/crossentropy": 2.5355948209762573, "loss/fcd": 1.06640625, "loss/idx": 11.0, "loss/logits": 0.15326430648565292, "step": 13978 }, { "epoch": 0.20873681302682565, "grad_norm": 0.263671875, "grad_norm_var": 0.0008366266886393229, "learning_rate": 0.0001, "loss": 1.3181, "loss/crossentropy": 2.6419827938079834, "loss/fcd": 1.1484375, "loss/idx": 11.0, "loss/logits": 0.1696612909436226, "step": 13979 }, { "epoch": 0.20875174519744064, "grad_norm": 0.298828125, "grad_norm_var": 0.0008382002512613932, "learning_rate": 0.0001, "loss": 1.3818, "loss/crossentropy": 2.346684217453003, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1904146745800972, "step": 13980 }, { "epoch": 0.2087666773680556, "grad_norm": 0.3203125, "grad_norm_var": 0.0008177280426025391, "learning_rate": 0.0001, "loss": 1.387, "loss/crossentropy": 2.6723073720932007, "loss/fcd": 1.1953125, "loss/idx": 11.0, "loss/logits": 0.1917177066206932, "step": 13981 }, { "epoch": 0.2087816095386706, "grad_norm": 0.52734375, "grad_norm_var": 0.004131507873535156, "learning_rate": 0.0001, "loss": 1.547, "loss/crossentropy": 2.546846628189087, "loss/fcd": 1.30859375, "loss/idx": 11.0, "loss/logits": 0.2384396344423294, "step": 13982 }, { "epoch": 0.20879654170928558, "grad_norm": 0.3203125, "grad_norm_var": 0.004032325744628906, "learning_rate": 0.0001, "loss": 1.3126, "loss/crossentropy": 2.5886967182159424, "loss/fcd": 1.140625, "loss/idx": 11.0, "loss/logits": 0.17193065583705902, "step": 13983 }, { "epoch": 0.20881147387990054, "grad_norm": 0.333984375, "grad_norm_var": 0.004053497314453125, "learning_rate": 0.0001, "loss": 1.5024, "loss/crossentropy": 2.9132444858551025, "loss/fcd": 1.29296875, "loss/idx": 11.0, "loss/logits": 0.20938582718372345, "step": 13984 }, { "epoch": 0.20882640605051553, "grad_norm": 0.2890625, "grad_norm_var": 0.003891611099243164, "learning_rate": 0.0001, "loss": 1.426, "loss/crossentropy": 2.495837450027466, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.2111111879348755, "step": 13985 }, { "epoch": 0.20884133822113052, "grad_norm": 0.341796875, "grad_norm_var": 0.003921365737915039, "learning_rate": 0.0001, "loss": 1.7241, "loss/crossentropy": 2.621558666229248, "loss/fcd": 1.453125, "loss/idx": 11.0, "loss/logits": 0.27094434201717377, "step": 13986 }, { "epoch": 0.2088562703917455, "grad_norm": 0.55859375, "grad_norm_var": 0.007421112060546875, "learning_rate": 0.0001, "loss": 1.4923, "loss/crossentropy": 2.603263020515442, "loss/fcd": 1.27734375, "loss/idx": 11.0, "loss/logits": 0.21492433547973633, "step": 13987 }, { "epoch": 0.20887120256236047, "grad_norm": 0.3046875, "grad_norm_var": 0.00728607177734375, "learning_rate": 0.0001, "loss": 1.3421, "loss/crossentropy": 2.535524845123291, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.18582819402217865, "step": 13988 }, { "epoch": 0.20888613473297546, "grad_norm": 0.2890625, "grad_norm_var": 0.0074198246002197266, "learning_rate": 0.0001, "loss": 1.3241, "loss/crossentropy": 2.4599941968917847, "loss/fcd": 1.15625, "loss/idx": 11.0, "loss/logits": 0.1678663045167923, "step": 13989 }, { "epoch": 0.20890106690359045, "grad_norm": 0.3046875, "grad_norm_var": 0.007256937026977539, "learning_rate": 0.0001, "loss": 1.2743, "loss/crossentropy": 2.6629343032836914, "loss/fcd": 1.10546875, "loss/idx": 11.0, "loss/logits": 0.1688777059316635, "step": 13990 }, { "epoch": 0.2089159990742054, "grad_norm": 0.2890625, "grad_norm_var": 0.007309722900390625, "learning_rate": 0.0001, "loss": 1.3811, "loss/crossentropy": 2.738383412361145, "loss/fcd": 1.19140625, "loss/idx": 11.0, "loss/logits": 0.1897265911102295, "step": 13991 }, { "epoch": 0.2089309312448204, "grad_norm": 0.291015625, "grad_norm_var": 0.007275629043579102, "learning_rate": 0.0001, "loss": 1.3655, "loss/crossentropy": 2.645993947982788, "loss/fcd": 1.1796875, "loss/idx": 11.0, "loss/logits": 0.18582303076982498, "step": 13992 }, { "epoch": 0.2089458634154354, "grad_norm": 0.33203125, "grad_norm_var": 0.007248926162719727, "learning_rate": 0.0001, "loss": 1.4739, "loss/crossentropy": 2.5313159227371216, "loss/fcd": 1.265625, "loss/idx": 11.0, "loss/logits": 0.2082987055182457, "step": 13993 }, { "epoch": 0.20896079558605038, "grad_norm": 0.3828125, "grad_norm_var": 0.007052469253540039, "learning_rate": 0.0001, "loss": 1.5785, "loss/crossentropy": 2.471598505973816, "loss/fcd": 1.3359375, "loss/idx": 11.0, "loss/logits": 0.24255813658237457, "step": 13994 }, { "epoch": 0.20897572775666534, "grad_norm": 0.283203125, "grad_norm_var": 0.006876357396443685, "learning_rate": 0.0001, "loss": 1.4122, "loss/crossentropy": 2.7201918363571167, "loss/fcd": 1.203125, "loss/idx": 11.0, "loss/logits": 0.20905796438455582, "step": 13995 }, { "epoch": 0.20899065992728033, "grad_norm": 0.3046875, "grad_norm_var": 0.00684502919514974, "learning_rate": 0.0001, "loss": 1.4232, "loss/crossentropy": 2.2795405983924866, "loss/fcd": 1.2421875, "loss/idx": 11.0, "loss/logits": 0.1810045689344406, "step": 13996 }, { "epoch": 0.20900559209789532, "grad_norm": 0.29296875, "grad_norm_var": 0.006970977783203125, "learning_rate": 0.0001, "loss": 1.2615, "loss/crossentropy": 2.8397382497787476, "loss/fcd": 1.0859375, "loss/idx": 11.0, "loss/logits": 0.1756037399172783, "step": 13997 }, { "epoch": 0.20902052426851028, "grad_norm": 0.306640625, "grad_norm_var": 0.004512135187784831, "learning_rate": 0.0001, "loss": 1.4355, "loss/crossentropy": 2.398559093475342, "loss/fcd": 1.21484375, "loss/idx": 11.0, "loss/logits": 0.22061404585838318, "step": 13998 }, { "epoch": 0.20903545643912527, "grad_norm": 0.2734375, "grad_norm_var": 0.004688374201456706, "learning_rate": 0.0001, "loss": 1.2592, "loss/crossentropy": 2.493852972984314, "loss/fcd": 1.09375, "loss/idx": 11.0, "loss/logits": 0.16541118174791336, "step": 13999 }, { "epoch": 0.20905038860974026, "grad_norm": 0.3203125, "grad_norm_var": 0.004681142171223959, "learning_rate": 0.0001, "loss": 1.4726, "loss/crossentropy": 2.271965742111206, "loss/fcd": 1.26171875, "loss/idx": 11.0, "loss/logits": 0.21084365993738174, "step": 14000 } ], "logging_steps": 1, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.826305691615232e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }