diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2518 +1,10806 @@ { - "best_global_step": 800, - "best_metric": 74.85714285714286, - "best_model_checkpoint": "/data/datht163/benchmark/checkpoint-800", - "epoch": 19.752351097178682, + "best_global_step": 3000, + "best_metric": 88.02395209580838, + "best_model_checkpoint": "/data/datht163/benchmark/checkpoint-3000", + "epoch": 19.998003992015967, "eval_steps": 500, - "global_step": 1580, + "global_step": 7500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.06269592476489028, - "grad_norm": 20.461719512939453, - "learning_rate": 6.25e-05, - "loss": 4.5641, + "epoch": 0.01330671989354624, + "grad_norm": 60.81864547729492, + "learning_rate": 1.3333333333333333e-05, + "loss": 4.6562, "step": 5 }, { - "epoch": 0.12539184952978055, - "grad_norm": 45.15224075317383, - "learning_rate": 0.000125, - "loss": 4.8469, + "epoch": 0.02661343978709248, + "grad_norm": 37.34549331665039, + "learning_rate": 2.6666666666666667e-05, + "loss": 4.0133, "step": 10 }, { - "epoch": 0.18808777429467086, - "grad_norm": 22.794708251953125, - "learning_rate": 0.0001875, - "loss": 4.368, + "epoch": 0.03992015968063872, + "grad_norm": 11.796462059020996, + "learning_rate": 4e-05, + "loss": 2.6719, "step": 15 }, { - "epoch": 0.2507836990595611, - "grad_norm": 19.09231185913086, - "learning_rate": 0.00019999677214588312, - "loss": 3.9258, + "epoch": 0.05322687957418496, + "grad_norm": 8.3694429397583, + "learning_rate": 5.333333333333333e-05, + "loss": 3.0212, "step": 20 }, { - "epoch": 0.31347962382445144, - "grad_norm": 15.92790412902832, - "learning_rate": 0.0001999836593456696, - "loss": 3.2234, + "epoch": 0.0665335994677312, + "grad_norm": 15.191309928894043, + "learning_rate": 6.666666666666667e-05, + "loss": 2.2947, "step": 25 }, { - "epoch": 0.3761755485893417, - "grad_norm": 19.797821044921875, - "learning_rate": 0.00019996046118014955, - "loss": 3.2684, + "epoch": 0.07984031936127745, + "grad_norm": 5.090390682220459, + "learning_rate": 8e-05, + "loss": 1.9277, "step": 30 }, { - "epoch": 0.438871473354232, - "grad_norm": 19.880847930908203, - "learning_rate": 0.00019992717998932507, - "loss": 3.1656, + "epoch": 0.09314703925482369, + "grad_norm": 11.129491806030273, + "learning_rate": 9.333333333333334e-05, + "loss": 2.1779, "step": 35 }, { - "epoch": 0.5015673981191222, - "grad_norm": 19.707406997680664, - "learning_rate": 0.00019988381913027442, - "loss": 3.0031, + "epoch": 0.10645375914836992, + "grad_norm": 7.373464584350586, + "learning_rate": 0.00010666666666666667, + "loss": 2.0246, "step": 40 }, { - "epoch": 0.5642633228840125, - "grad_norm": 14.307902336120605, - "learning_rate": 0.00019983038297681336, - "loss": 2.7219, + "epoch": 0.11976047904191617, + "grad_norm": 11.810800552368164, + "learning_rate": 0.00012, + "loss": 1.8453, "step": 45 }, { - "epoch": 0.6269592476489029, - "grad_norm": 13.697257995605469, - "learning_rate": 0.00019976687691905393, - "loss": 2.6117, + "epoch": 0.1330671989354624, + "grad_norm": 11.086739540100098, + "learning_rate": 0.00013333333333333334, + "loss": 2.0291, "step": 50 }, { - "epoch": 0.6896551724137931, - "grad_norm": 20.15327262878418, - "learning_rate": 0.0001996933073628608, - "loss": 2.7996, + "epoch": 0.14637391882900866, + "grad_norm": 9.655720710754395, + "learning_rate": 0.00014666666666666666, + "loss": 1.8613, "step": 55 }, { - "epoch": 0.7523510971786834, - "grad_norm": 20.71539878845215, - "learning_rate": 0.00019960968172920516, - "loss": 2.5578, + "epoch": 0.1596806387225549, + "grad_norm": 9.862675666809082, + "learning_rate": 0.00016, + "loss": 1.5939, "step": 60 }, { - "epoch": 0.8150470219435737, - "grad_norm": 8.937211036682129, - "learning_rate": 0.00019951600845341595, - "loss": 2.9141, + "epoch": 0.17298735861610112, + "grad_norm": 7.531759262084961, + "learning_rate": 0.00017333333333333334, + "loss": 2.0128, "step": 65 }, { - "epoch": 0.877742946708464, - "grad_norm": 8.478225708007812, - "learning_rate": 0.0001994122969843293, - "loss": 2.8832, + "epoch": 0.18629407850964738, + "grad_norm": 9.651808738708496, + "learning_rate": 0.0001866666666666667, + "loss": 1.7102, "step": 70 }, { - "epoch": 0.9404388714733543, - "grad_norm": 15.695404052734375, - "learning_rate": 0.00019929855778333516, - "loss": 2.7437, + "epoch": 0.1996007984031936, + "grad_norm": 9.359721183776855, + "learning_rate": 0.0002, + "loss": 1.7814, "step": 75 }, { - "epoch": 1.0, - "grad_norm": 19.672351837158203, - "learning_rate": 0.00019917480232332224, - "loss": 2.2395, + "epoch": 0.21290751829673984, + "grad_norm": 14.421661376953125, + "learning_rate": 0.00019999977622236462, + "loss": 1.8324, "step": 80 }, { - "epoch": 1.0, - "eval_loss": 0.6336904764175415, - "eval_macro_f1": 73.26745178358081, - "eval_macro_precision": 75.24521017228876, - "eval_macro_recall": 73.14285714285714, - "eval_micro_f1": 73.14285714285714, - "eval_micro_precision": 73.14285714285714, - "eval_micro_recall": 73.14285714285714, - "eval_runtime": 5.7289, - "eval_samples_per_second": 183.283, - "eval_steps_per_second": 11.521, - "step": 80 - }, - { - "epoch": 1.0626959247648902, - "grad_norm": 14.15684700012207, - "learning_rate": 0.0001990410430875205, - "loss": 1.4396, + "epoch": 0.2262142381902861, + "grad_norm": 8.235530853271484, + "learning_rate": 0.00019999910489045997, + "loss": 1.8022, "step": 85 }, { - "epoch": 1.1253918495297806, - "grad_norm": 20.00255012512207, - "learning_rate": 0.00019889729356824235, - "loss": 1.3728, + "epoch": 0.23952095808383234, + "grad_norm": 19.68333625793457, + "learning_rate": 0.00019999798600729064, + "loss": 1.6848, "step": 90 }, { - "epoch": 1.188087774294671, - "grad_norm": 13.56872272491455, - "learning_rate": 0.00019874356826552129, - "loss": 1.6022, + "epoch": 0.2528276779773786, + "grad_norm": 9.926045417785645, + "learning_rate": 0.00019999641957786432, + "loss": 1.4403, "step": 95 }, { - "epoch": 1.250783699059561, - "grad_norm": 6.213724613189697, - "learning_rate": 0.00019857988268564953, - "loss": 1.2819, + "epoch": 0.2661343978709248, + "grad_norm": 8.2724027633667, + "learning_rate": 0.00019999440560919152, + "loss": 2.1332, "step": 100 }, { - "epoch": 1.3134796238244515, - "grad_norm": 28.165010452270508, - "learning_rate": 0.0001984062533396137, - "loss": 1.1332, + "epoch": 0.27944111776447106, + "grad_norm": 13.359396934509277, + "learning_rate": 0.00019999194411028594, + "loss": 1.7551, "step": 105 }, { - "epoch": 1.3761755485893417, - "grad_norm": 15.799768447875977, - "learning_rate": 0.00019822269774142954, - "loss": 1.244, + "epoch": 0.2927478376580173, + "grad_norm": 11.357268333435059, + "learning_rate": 0.00019998903509216415, + "loss": 1.9558, "step": 110 }, { - "epoch": 1.438871473354232, - "grad_norm": 21.20913314819336, - "learning_rate": 0.0001980292344063752, - "loss": 1.4427, + "epoch": 0.3060545575515635, + "grad_norm": 13.34130859375, + "learning_rate": 0.0001999856785678456, + "loss": 1.6962, "step": 115 }, { - "epoch": 1.5015673981191222, - "grad_norm": 18.147506713867188, - "learning_rate": 0.0001978258828491236, - "loss": 1.4596, + "epoch": 0.3193612774451098, + "grad_norm": 4.8148698806762695, + "learning_rate": 0.0001999818745523526, + "loss": 1.6982, "step": 120 }, { - "epoch": 1.5642633228840124, - "grad_norm": 17.44733428955078, - "learning_rate": 0.00019761266358177398, - "loss": 0.9827, + "epoch": 0.33266799733865604, + "grad_norm": 6.624136447906494, + "learning_rate": 0.0001999776230627102, + "loss": 1.8629, "step": 125 }, { - "epoch": 1.626959247648903, - "grad_norm": 18.33002471923828, - "learning_rate": 0.00019738959811178272, - "loss": 1.2366, + "epoch": 0.34597471723220224, + "grad_norm": 6.114648818969727, + "learning_rate": 0.00019997292411794618, + "loss": 1.6615, "step": 130 }, { - "epoch": 1.6896551724137931, - "grad_norm": 16.217641830444336, - "learning_rate": 0.00019715670893979414, - "loss": 1.2665, + "epoch": 0.3592814371257485, + "grad_norm": 7.139102458953857, + "learning_rate": 0.00019996777773909093, + "loss": 1.4755, "step": 135 }, { - "epoch": 1.7523510971786833, - "grad_norm": 16.21353530883789, - "learning_rate": 0.00019691401955737072, - "loss": 1.5896, + "epoch": 0.37258815701929476, + "grad_norm": 15.46536636352539, + "learning_rate": 0.0001999621839491773, + "loss": 1.8553, "step": 140 }, { - "epoch": 1.8150470219435737, - "grad_norm": 11.688411712646484, - "learning_rate": 0.0001966615544446234, - "loss": 1.5775, + "epoch": 0.38589487691284097, + "grad_norm": 6.5360331535339355, + "learning_rate": 0.00019995614277324065, + "loss": 1.5998, "step": 145 }, { - "epoch": 1.877742946708464, - "grad_norm": 14.346232414245605, - "learning_rate": 0.0001963993390677424, - "loss": 1.1833, + "epoch": 0.3992015968063872, + "grad_norm": 8.404138565063477, + "learning_rate": 0.00019994965423831854, + "loss": 1.4908, "step": 150 }, { - "epoch": 1.9404388714733543, - "grad_norm": 6.273299694061279, - "learning_rate": 0.00019612739987642845, - "loss": 1.0552, + "epoch": 0.4125083166999335, + "grad_norm": 8.757245063781738, + "learning_rate": 0.00019994271837345072, + "loss": 1.6893, "step": 155 }, { - "epoch": 2.0, - "grad_norm": 17.62116050720215, - "learning_rate": 0.00019584576430122473, - "loss": 1.0835, - "step": 160 - }, - { - "epoch": 2.0, - "eval_loss": 0.7825595140457153, - "eval_macro_f1": 73.56897234236968, - "eval_macro_precision": 73.7348812342735, - "eval_macro_recall": 73.90476190476191, - "eval_micro_f1": 73.90476190476191, - "eval_micro_precision": 73.90476190476191, - "eval_micro_recall": 73.90476190476191, - "eval_runtime": 3.4148, - "eval_samples_per_second": 307.482, - "eval_steps_per_second": 19.327, + "epoch": 0.4258150365934797, + "grad_norm": 5.324100017547607, + "learning_rate": 0.00019993533520967912, + "loss": 1.4273, "step": 160 }, { - "epoch": 2.06269592476489, - "grad_norm": 4.175657749176025, - "learning_rate": 0.00019555446075075, - "loss": 0.3124, + "epoch": 0.43912175648702595, + "grad_norm": 6.764120101928711, + "learning_rate": 0.00019992750478004738, + "loss": 1.3152, "step": 165 }, { - "epoch": 2.1253918495297803, - "grad_norm": 9.987613677978516, - "learning_rate": 0.00019525351860883293, - "loss": 0.3053, + "epoch": 0.4524284763805722, + "grad_norm": 16.72058868408203, + "learning_rate": 0.00019991922711960102, + "loss": 1.5234, "step": 170 }, { - "epoch": 2.188087774294671, - "grad_norm": 27.39342498779297, - "learning_rate": 0.00019494296823154835, - "loss": 0.2825, + "epoch": 0.4657351962741184, + "grad_norm": 7.501124858856201, + "learning_rate": 0.0001999105022653872, + "loss": 1.51, "step": 175 }, { - "epoch": 2.250783699059561, - "grad_norm": 5.328784942626953, - "learning_rate": 0.000194622840944155, - "loss": 0.32, + "epoch": 0.47904191616766467, + "grad_norm": 9.424518585205078, + "learning_rate": 0.0001999013302564544, + "loss": 1.6568, "step": 180 }, { - "epoch": 2.3134796238244513, - "grad_norm": 8.484028816223145, - "learning_rate": 0.00019429316903793583, - "loss": 0.0699, + "epoch": 0.49234863606121093, + "grad_norm": 5.219127655029297, + "learning_rate": 0.0001998917111338525, + "loss": 1.3971, "step": 185 }, { - "epoch": 2.376175548589342, - "grad_norm": 16.879404067993164, - "learning_rate": 0.00019395398576694086, - "loss": 0.2403, + "epoch": 0.5056553559547572, + "grad_norm": 8.969573020935059, + "learning_rate": 0.00019988164494063226, + "loss": 1.5215, "step": 190 }, { - "epoch": 2.438871473354232, - "grad_norm": 49.66862487792969, - "learning_rate": 0.0001936053253446327, - "loss": 0.2737, + "epoch": 0.5189620758483033, + "grad_norm": 9.649981498718262, + "learning_rate": 0.00019987113172184563, + "loss": 1.5256, "step": 195 }, { - "epoch": 2.501567398119122, - "grad_norm": 17.669599533081055, - "learning_rate": 0.00019324722294043558, - "loss": 0.4838, + "epoch": 0.5322687957418496, + "grad_norm": 7.442275047302246, + "learning_rate": 0.00019986017152454495, + "loss": 1.3934, "step": 200 }, { - "epoch": 2.5642633228840124, - "grad_norm": 14.710466384887695, - "learning_rate": 0.00019287971467618766, - "loss": 0.2973, + "epoch": 0.5455755156353959, + "grad_norm": 6.018413543701172, + "learning_rate": 0.0001998487643977832, + "loss": 1.4781, "step": 205 }, { - "epoch": 2.626959247648903, - "grad_norm": 25.36009979248047, - "learning_rate": 0.00019250283762249748, - "loss": 0.4224, + "epoch": 0.5588822355289421, + "grad_norm": 4.8704705238342285, + "learning_rate": 0.00019983691039261357, + "loss": 1.6262, "step": 210 }, { - "epoch": 2.689655172413793, - "grad_norm": 12.651631355285645, - "learning_rate": 0.00019211662979500468, - "loss": 0.1925, + "epoch": 0.5721889554224884, + "grad_norm": 6.425934314727783, + "learning_rate": 0.0001998246095620893, + "loss": 1.4669, "step": 215 }, { - "epoch": 2.7523510971786833, - "grad_norm": 14.720685958862305, - "learning_rate": 0.00019172113015054532, - "loss": 0.3322, + "epoch": 0.5854956753160346, + "grad_norm": 9.25566291809082, + "learning_rate": 0.0001998118619612634, + "loss": 1.6061, "step": 220 }, { - "epoch": 2.815047021943574, - "grad_norm": 15.48449993133545, - "learning_rate": 0.00019131637858322225, - "loss": 0.1582, + "epoch": 0.5988023952095808, + "grad_norm": 3.8261756896972656, + "learning_rate": 0.00019979866764718843, + "loss": 1.4114, "step": 225 }, { - "epoch": 2.877742946708464, - "grad_norm": 8.015260696411133, - "learning_rate": 0.00019090241592038113, - "loss": 0.3334, + "epoch": 0.612109115103127, + "grad_norm": 7.696674346923828, + "learning_rate": 0.00019978502667891625, + "loss": 1.8654, "step": 230 }, { - "epoch": 2.9404388714733543, - "grad_norm": 20.96394157409668, - "learning_rate": 0.00019047928391849195, - "loss": 0.2362, + "epoch": 0.6254158349966733, + "grad_norm": 7.678075313568115, + "learning_rate": 0.0001997709391174977, + "loss": 1.6465, "step": 235 }, { - "epoch": 3.0, - "grad_norm": 37.81004333496094, - "learning_rate": 0.00019004702525893732, - "loss": 0.2647, - "step": 240 - }, - { - "epoch": 3.0, - "eval_loss": 1.3779538869857788, - "eval_macro_f1": 73.11543717930599, - "eval_macro_precision": 73.228985585862, - "eval_macro_recall": 73.23809523809524, - "eval_micro_f1": 73.23809523809524, - "eval_micro_precision": 73.23809523809524, - "eval_micro_recall": 73.23809523809524, - "eval_runtime": 3.6271, - "eval_samples_per_second": 289.487, - "eval_steps_per_second": 18.196, + "epoch": 0.6387225548902196, + "grad_norm": 6.352901458740234, + "learning_rate": 0.00019975640502598244, + "loss": 1.3295, "step": 240 }, { - "epoch": 3.06269592476489, - "grad_norm": 8.566853523254395, - "learning_rate": 0.0001896056835437069, - "loss": 0.0999, + "epoch": 0.6520292747837658, + "grad_norm": 5.327084541320801, + "learning_rate": 0.00019974142446941851, + "loss": 1.496, "step": 245 }, { - "epoch": 3.1253918495297803, - "grad_norm": 8.521112442016602, - "learning_rate": 0.0001891553032909996, - "loss": 0.0719, + "epoch": 0.6653359946773121, + "grad_norm": 6.817936897277832, + "learning_rate": 0.00019972599751485226, + "loss": 1.3759, "step": 250 }, { - "epoch": 3.188087774294671, - "grad_norm": 16.376962661743164, - "learning_rate": 0.00018869592993073258, - "loss": 0.2835, + "epoch": 0.6786427145708582, + "grad_norm": 3.684908628463745, + "learning_rate": 0.00019971012423132775, + "loss": 0.9475, "step": 255 }, { - "epoch": 3.250783699059561, - "grad_norm": 33.38531494140625, - "learning_rate": 0.0001882276097999592, - "loss": 0.2702, + "epoch": 0.6919494344644045, + "grad_norm": 5.260998725891113, + "learning_rate": 0.00019969380468988677, + "loss": 1.0685, "step": 260 }, { - "epoch": 3.3134796238244513, - "grad_norm": 24.302696228027344, - "learning_rate": 0.0001877503901381947, - "loss": 0.2763, + "epoch": 0.7052561543579507, + "grad_norm": 8.901612281799316, + "learning_rate": 0.00019967703896356823, + "loss": 1.4494, "step": 265 }, { - "epoch": 3.376175548589342, - "grad_norm": 18.85578727722168, - "learning_rate": 0.0001872643190826512, - "loss": 0.5174, + "epoch": 0.718562874251497, + "grad_norm": 6.55958890914917, + "learning_rate": 0.00019965982712740808, + "loss": 1.5111, "step": 270 }, { - "epoch": 3.438871473354232, - "grad_norm": 53.06370162963867, - "learning_rate": 0.00018676944566338213, - "loss": 0.2884, + "epoch": 0.7318695941450433, + "grad_norm": 5.0339202880859375, + "learning_rate": 0.00019964216925843874, + "loss": 1.2905, "step": 275 }, { - "epoch": 3.501567398119122, - "grad_norm": 34.46812438964844, - "learning_rate": 0.0001862658197983366, - "loss": 0.2232, + "epoch": 0.7451763140385895, + "grad_norm": 7.584630966186523, + "learning_rate": 0.00019962406543568898, + "loss": 1.2245, "step": 280 }, { - "epoch": 3.5642633228840124, - "grad_norm": 18.52553939819336, - "learning_rate": 0.00018575349228832397, - "loss": 0.1158, + "epoch": 0.7584830339321357, + "grad_norm": 10.351391792297363, + "learning_rate": 0.0001996055157401834, + "loss": 1.407, "step": 285 }, { - "epoch": 3.626959247648903, - "grad_norm": 0.9173059463500977, - "learning_rate": 0.00018523251481188986, - "loss": 0.2123, + "epoch": 0.7717897538256819, + "grad_norm": 6.787538051605225, + "learning_rate": 0.00019958652025494212, + "loss": 1.718, "step": 290 }, { - "epoch": 3.689655172413793, - "grad_norm": 12.346527099609375, - "learning_rate": 0.00018470293992010312, - "loss": 0.2126, + "epoch": 0.7850964737192282, + "grad_norm": 4.587769508361816, + "learning_rate": 0.00019956707906498044, + "loss": 1.5187, "step": 295 }, { - "epoch": 3.7523510971786833, - "grad_norm": 84.08663177490234, - "learning_rate": 0.00018416482103125506, - "loss": 0.222, + "epoch": 0.7984031936127745, + "grad_norm": 6.607381820678711, + "learning_rate": 0.00019954719225730847, + "loss": 1.4279, "step": 300 }, { - "epoch": 3.815047021943574, - "grad_norm": 19.547080993652344, - "learning_rate": 0.0001836182124254711, - "loss": 0.1389, + "epoch": 0.8117099135063207, + "grad_norm": 9.418729782104492, + "learning_rate": 0.00019952685992093063, + "loss": 1.3854, "step": 305 }, { - "epoch": 3.877742946708464, - "grad_norm": 1.2908215522766113, - "learning_rate": 0.00018306316923923563, - "loss": 0.1632, + "epoch": 0.825016633399867, + "grad_norm": 7.311931133270264, + "learning_rate": 0.00019950608214684535, + "loss": 1.5504, "step": 310 }, { - "epoch": 3.9404388714733543, - "grad_norm": 30.96961784362793, - "learning_rate": 0.00018249974745983023, - "loss": 0.2089, + "epoch": 0.8383233532934131, + "grad_norm": 7.015618324279785, + "learning_rate": 0.0001994848590280447, + "loss": 1.528, "step": 315 }, { - "epoch": 4.0, - "grad_norm": 13.291217803955078, - "learning_rate": 0.00018192800391968642, - "loss": 0.3228, - "step": 320 - }, - { - "epoch": 4.0, - "eval_loss": 1.7461718320846558, - "eval_macro_f1": 72.72572059706258, - "eval_macro_precision": 72.71473132903009, - "eval_macro_recall": 72.76190476190476, - "eval_micro_f1": 72.76190476190476, - "eval_micro_precision": 72.76190476190476, - "eval_micro_recall": 72.76190476190476, - "eval_runtime": 4.7844, - "eval_samples_per_second": 219.463, - "eval_steps_per_second": 13.795, + "epoch": 0.8516300731869594, + "grad_norm": 5.8235697746276855, + "learning_rate": 0.00019946319065951382, + "loss": 1.0753, "step": 320 }, { - "epoch": 4.06269592476489, - "grad_norm": 22.18653678894043, - "learning_rate": 0.00018134799629065276, - "loss": 0.3768, + "epoch": 0.8649367930805056, + "grad_norm": 4.4757513999938965, + "learning_rate": 0.0001994410771382307, + "loss": 1.4042, "step": 325 }, { - "epoch": 4.12539184952978, - "grad_norm": 18.046934127807617, - "learning_rate": 0.00018075978307817764, - "loss": 0.1879, + "epoch": 0.8782435129740519, + "grad_norm": 6.9428510665893555, + "learning_rate": 0.00019941851856316548, + "loss": 1.3328, "step": 330 }, { - "epoch": 4.1880877742946705, - "grad_norm": 8.421941757202148, - "learning_rate": 0.00018016342361540773, - "loss": 0.2067, + "epoch": 0.8915502328675982, + "grad_norm": 5.409964084625244, + "learning_rate": 0.0001993955150352803, + "loss": 1.2911, "step": 335 }, { - "epoch": 4.250783699059561, - "grad_norm": 37.9600830078125, - "learning_rate": 0.0001795589780572031, - "loss": 0.1612, + "epoch": 0.9048569527611444, + "grad_norm": 6.9928297996521, + "learning_rate": 0.0001993720666575287, + "loss": 1.379, "step": 340 }, { - "epoch": 4.313479623824452, - "grad_norm": 2.552628993988037, - "learning_rate": 0.0001789465073740693, - "loss": 0.0256, + "epoch": 0.9181636726546906, + "grad_norm": 5.91607666015625, + "learning_rate": 0.00019934817353485501, + "loss": 1.3604, "step": 345 }, { - "epoch": 4.376175548589342, - "grad_norm": 45.092559814453125, - "learning_rate": 0.00017832607334600746, - "loss": 0.2179, + "epoch": 0.9314703925482368, + "grad_norm": 6.265745162963867, + "learning_rate": 0.00019932383577419432, + "loss": 1.3839, "step": 350 }, { - "epoch": 4.438871473354232, - "grad_norm": 1.6102479696273804, - "learning_rate": 0.00017769773855628226, - "loss": 0.1914, + "epoch": 0.9447771124417831, + "grad_norm": 6.445305824279785, + "learning_rate": 0.0001992990534844714, + "loss": 1.3727, "step": 355 }, { - "epoch": 4.501567398119122, - "grad_norm": 27.624589920043945, - "learning_rate": 0.0001770615663851093, - "loss": 0.1637, + "epoch": 0.9580838323353293, + "grad_norm": 8.271346092224121, + "learning_rate": 0.00019927382677660088, + "loss": 1.4262, "step": 360 }, { - "epoch": 4.564263322884012, - "grad_norm": 31.627342224121094, - "learning_rate": 0.00017641762100326208, - "loss": 0.1678, + "epoch": 0.9713905522288756, + "grad_norm": 5.764677047729492, + "learning_rate": 0.00019924815576348603, + "loss": 1.4416, "step": 365 }, { - "epoch": 4.6269592476489025, - "grad_norm": 17.774938583374023, - "learning_rate": 0.0001757659673655986, - "loss": 0.0717, + "epoch": 0.9846972721224219, + "grad_norm": 7.0903215408325195, + "learning_rate": 0.00019922204056001895, + "loss": 1.3508, "step": 370 }, { - "epoch": 4.689655172413794, - "grad_norm": 5.605461597442627, - "learning_rate": 0.00017510667120450998, - "loss": 0.2968, + "epoch": 0.998003992015968, + "grad_norm": 8.942666053771973, + "learning_rate": 0.00019919548128307954, + "loss": 1.2008, "step": 375 }, { - "epoch": 4.752351097178684, - "grad_norm": 27.00714683532715, - "learning_rate": 0.00017443979902328956, - "loss": 0.5085, + "epoch": 0.998003992015968, + "eval_loss": 0.37667316198349, + "eval_macro_f1": 59.54495855355076, + "eval_macro_precision": 76.51384999324237, + "eval_macro_recall": 54.28674905628468, + "eval_micro_f1": 86.71407185628742, + "eval_micro_precision": 86.71407185628742, + "eval_micro_recall": 86.71407185628742, + "eval_runtime": 6.2328, + "eval_samples_per_second": 428.701, + "eval_steps_per_second": 26.794, + "step": 375 + }, + { + "epoch": 1.0133067198935461, + "grad_norm": 4.265635967254639, + "learning_rate": 0.00019916847805153526, + "loss": 1.0752, "step": 380 }, { - "epoch": 4.815047021943574, - "grad_norm": 26.765687942504883, - "learning_rate": 0.0001737654180894249, - "loss": 0.1005, + "epoch": 1.0266134397870925, + "grad_norm": 5.709604740142822, + "learning_rate": 0.00019914103098624055, + "loss": 0.7603, "step": 385 }, { - "epoch": 4.877742946708464, - "grad_norm": 15.505696296691895, - "learning_rate": 0.00017308359642781242, - "loss": 0.197, + "epoch": 1.0399201596806387, + "grad_norm": 6.586650848388672, + "learning_rate": 0.00019911314021003613, + "loss": 0.7114, "step": 390 }, { - "epoch": 4.940438871473354, - "grad_norm": 7.485856056213379, - "learning_rate": 0.00017239440281389582, - "loss": 0.1499, + "epoch": 1.053226879574185, + "grad_norm": 3.8452811241149902, + "learning_rate": 0.00019908480584774867, + "loss": 0.4094, "step": 395 }, { - "epoch": 5.0, - "grad_norm": 2.3405601978302, - "learning_rate": 0.00017169790676672858, - "loss": 0.0747, + "epoch": 1.0665335994677312, + "grad_norm": 23.499847412109375, + "learning_rate": 0.00019905602802619007, + "loss": 0.7433, "step": 400 }, { - "epoch": 5.0, - "eval_loss": 1.7594363689422607, - "eval_macro_f1": 74.17047440340497, - "eval_macro_precision": 75.04188054945206, - "eval_macro_recall": 73.99999999999999, - "eval_micro_f1": 74.0, - "eval_micro_precision": 74.0, - "eval_micro_recall": 74.0, - "eval_runtime": 4.2956, - "eval_samples_per_second": 244.435, - "eval_steps_per_second": 15.364, - "step": 400 - }, - { - "epoch": 5.06269592476489, - "grad_norm": 11.809943199157715, - "learning_rate": 0.00017099417854196165, - "loss": 0.0426, + "epoch": 1.0798403193612773, + "grad_norm": 10.368353843688965, + "learning_rate": 0.00019902680687415705, + "loss": 1.1225, "step": 405 }, { - "epoch": 5.12539184952978, - "grad_norm": 4.720785140991211, - "learning_rate": 0.00017028328912475668, - "loss": 0.044, + "epoch": 1.0931470392548237, + "grad_norm": 10.001039505004883, + "learning_rate": 0.00019899714252243035, + "loss": 0.7539, "step": 410 }, { - "epoch": 5.1880877742946705, - "grad_norm": 14.604973793029785, - "learning_rate": 0.00016956531022262585, - "loss": 0.1949, + "epoch": 1.1064537591483699, + "grad_norm": 6.002054214477539, + "learning_rate": 0.00019896703510377436, + "loss": 0.9843, "step": 415 }, { - "epoch": 5.250783699059561, - "grad_norm": 0.6709997653961182, - "learning_rate": 0.00016884031425819853, - "loss": 0.295, + "epoch": 1.1197604790419162, + "grad_norm": 14.071710586547852, + "learning_rate": 0.00019893648475293648, + "loss": 1.1239, "step": 420 }, { - "epoch": 5.313479623824452, - "grad_norm": 69.35284423828125, - "learning_rate": 0.00016810837436191623, - "loss": 0.4807, + "epoch": 1.1330671989354624, + "grad_norm": 4.876122951507568, + "learning_rate": 0.00019890549160664632, + "loss": 0.8278, "step": 425 }, { - "epoch": 5.376175548589342, - "grad_norm": 24.146665573120117, - "learning_rate": 0.00016736956436465573, - "loss": 0.5015, + "epoch": 1.1463739188290087, + "grad_norm": 7.125596046447754, + "learning_rate": 0.00019887405580361537, + "loss": 0.8853, "step": 430 }, { - "epoch": 5.438871473354232, - "grad_norm": 3.2245795726776123, - "learning_rate": 0.0001666239587902819, - "loss": 0.1359, + "epoch": 1.159680638722555, + "grad_norm": 4.5107645988464355, + "learning_rate": 0.00019884217748453623, + "loss": 0.7046, "step": 435 }, { - "epoch": 5.501567398119122, - "grad_norm": 8.290742874145508, - "learning_rate": 0.00016587163284813032, - "loss": 0.0412, + "epoch": 1.172987358616101, + "grad_norm": 7.978573322296143, + "learning_rate": 0.00019880985679208207, + "loss": 0.9162, "step": 440 }, { - "epoch": 5.564263322884012, - "grad_norm": 21.256576538085938, - "learning_rate": 0.00016511266242542102, - "loss": 0.0626, + "epoch": 1.1862940785096474, + "grad_norm": 11.903189659118652, + "learning_rate": 0.0001987770938709057, + "loss": 0.9984, "step": 445 }, { - "epoch": 5.6269592476489025, - "grad_norm": 1.178815245628357, - "learning_rate": 0.00016434712407960373, - "loss": 0.0422, + "epoch": 1.1996007984031936, + "grad_norm": 15.054693222045898, + "learning_rate": 0.00019874388886763944, + "loss": 0.9147, "step": 450 }, { - "epoch": 5.689655172413794, - "grad_norm": 19.474632263183594, - "learning_rate": 0.00016357509503063538, - "loss": 0.3287, + "epoch": 1.21290751829674, + "grad_norm": 9.752163887023926, + "learning_rate": 0.000198710241930894, + "loss": 0.868, "step": 455 }, { - "epoch": 5.752351097178684, - "grad_norm": 38.11343002319336, - "learning_rate": 0.00016279665315319114, - "loss": 0.3171, + "epoch": 1.226214238190286, + "grad_norm": 8.092059135437012, + "learning_rate": 0.00019867615321125795, + "loss": 0.7435, "step": 460 }, { - "epoch": 5.815047021943574, - "grad_norm": 63.41005325317383, - "learning_rate": 0.00016201187696880887, - "loss": 0.2117, + "epoch": 1.2395209580838324, + "grad_norm": 8.409940719604492, + "learning_rate": 0.0001986416228612972, + "loss": 0.7041, "step": 465 }, { - "epoch": 5.877742946708464, - "grad_norm": 1.7413610219955444, - "learning_rate": 0.00016122084563796905, - "loss": 0.0836, + "epoch": 1.2528276779773786, + "grad_norm": 6.191831588745117, + "learning_rate": 0.00019860665103555415, + "loss": 0.7856, "step": 470 }, { - "epoch": 5.940438871473354, - "grad_norm": 25.483036041259766, - "learning_rate": 0.00016042363895210946, - "loss": 0.4536, + "epoch": 1.2661343978709247, + "grad_norm": 11.023136138916016, + "learning_rate": 0.00019857123789054706, + "loss": 0.9971, "step": 475 }, { - "epoch": 6.0, - "grad_norm": 13.102269172668457, - "learning_rate": 0.00015962033732557686, - "loss": 0.0868, - "step": 480 - }, - { - "epoch": 6.0, - "eval_loss": 1.5808621644973755, - "eval_macro_f1": 73.24384906714903, - "eval_macro_precision": 73.61945929207427, - "eval_macro_recall": 73.61904761904762, - "eval_micro_f1": 73.61904761904762, - "eval_micro_precision": 73.61904761904762, - "eval_micro_recall": 73.61904761904762, - "eval_runtime": 5.0263, - "eval_samples_per_second": 208.902, - "eval_steps_per_second": 13.131, + "epoch": 1.2794411177644711, + "grad_norm": 12.192377090454102, + "learning_rate": 0.00019853538358476932, + "loss": 0.9177, "step": 480 }, { - "epoch": 6.06269592476489, - "grad_norm": 6.51555871963501, - "learning_rate": 0.00015881102178751553, - "loss": 0.0678, + "epoch": 1.2927478376580173, + "grad_norm": 6.031184673309326, + "learning_rate": 0.00019849908827868876, + "loss": 0.5903, "step": 485 }, { - "epoch": 6.12539184952978, - "grad_norm": 19.54594612121582, - "learning_rate": 0.00015799577397369375, - "loss": 0.0731, + "epoch": 1.3060545575515636, + "grad_norm": 5.597287654876709, + "learning_rate": 0.00019846235213474692, + "loss": 0.8199, "step": 490 }, { - "epoch": 6.1880877742946705, - "grad_norm": 7.0642266273498535, - "learning_rate": 0.0001571746761182693, - "loss": 0.0848, + "epoch": 1.3193612774451098, + "grad_norm": 12.112035751342773, + "learning_rate": 0.00019842517531735838, + "loss": 0.976, "step": 495 }, { - "epoch": 6.250783699059561, - "grad_norm": 4.012495040893555, - "learning_rate": 0.00015634781104549442, - "loss": 0.1141, + "epoch": 1.332667997338656, + "grad_norm": 8.983979225158691, + "learning_rate": 0.00019838755799290994, + "loss": 0.6765, "step": 500 }, { - "epoch": 6.313479623824452, - "grad_norm": 2.1113383769989014, - "learning_rate": 0.00015551526216136144, - "loss": 0.0945, + "epoch": 1.3459747172322023, + "grad_norm": 11.685017585754395, + "learning_rate": 0.00019834950032975986, + "loss": 0.7698, "step": 505 }, { - "epoch": 6.376175548589342, - "grad_norm": 37.096473693847656, - "learning_rate": 0.00015467711344518942, - "loss": 0.225, + "epoch": 1.3592814371257484, + "grad_norm": 7.77982759475708, + "learning_rate": 0.00019831100249823733, + "loss": 0.7094, "step": 510 }, { - "epoch": 6.438871473354232, - "grad_norm": 25.69189453125, - "learning_rate": 0.00015383344944115324, - "loss": 0.0556, + "epoch": 1.3725881570192948, + "grad_norm": 7.004415035247803, + "learning_rate": 0.00019827206467064133, + "loss": 0.8004, "step": 515 }, { - "epoch": 6.501567398119122, - "grad_norm": 4.062742710113525, - "learning_rate": 0.00015298435524975572, - "loss": 0.0765, + "epoch": 1.385894876912841, + "grad_norm": 9.52391242980957, + "learning_rate": 0.0001982326870212402, + "loss": 0.7738, "step": 520 }, { - "epoch": 6.564263322884012, - "grad_norm": 14.947588920593262, - "learning_rate": 0.00015212991651924326, - "loss": 0.0302, + "epoch": 1.3992015968063871, + "grad_norm": 6.2501726150512695, + "learning_rate": 0.00019819286972627066, + "loss": 0.7728, "step": 525 }, { - "epoch": 6.6269592476489025, - "grad_norm": 28.04601287841797, - "learning_rate": 0.0001512702194369668, - "loss": 0.4446, + "epoch": 1.4125083166999335, + "grad_norm": 11.152785301208496, + "learning_rate": 0.00019815261296393715, + "loss": 0.946, "step": 530 }, { - "epoch": 6.689655172413794, - "grad_norm": 0.8309017419815063, - "learning_rate": 0.00015040535072068778, - "loss": 0.1228, + "epoch": 1.4258150365934796, + "grad_norm": 10.387774467468262, + "learning_rate": 0.0001981119169144109, + "loss": 0.6949, "step": 535 }, { - "epoch": 6.752351097178684, - "grad_norm": 0.3538524806499481, - "learning_rate": 0.00014953539760983122, - "loss": 0.1258, + "epoch": 1.439121756487026, + "grad_norm": 10.800827980041504, + "learning_rate": 0.00019807078175982924, + "loss": 0.9506, "step": 540 }, { - "epoch": 6.815047021943574, - "grad_norm": 0.2524064779281616, - "learning_rate": 0.00014866044785668563, - "loss": 0.036, + "epoch": 1.4524284763805722, + "grad_norm": 11.124509811401367, + "learning_rate": 0.0001980292076842947, + "loss": 1.139, "step": 545 }, { - "epoch": 6.877742946708464, - "grad_norm": 46.764183044433594, - "learning_rate": 0.00014778058971755154, - "loss": 0.294, + "epoch": 1.4657351962741183, + "grad_norm": 8.33337116241455, + "learning_rate": 0.00019798719487387428, + "loss": 0.8012, "step": 550 }, { - "epoch": 6.940438871473354, - "grad_norm": 33.56895446777344, - "learning_rate": 0.00014689591194383896, - "loss": 0.1452, + "epoch": 1.4790419161676647, + "grad_norm": 16.182756423950195, + "learning_rate": 0.00019794474351659852, + "loss": 0.7395, "step": 555 }, { - "epoch": 7.0, - "grad_norm": 0.07464821636676788, - "learning_rate": 0.00014600650377311522, - "loss": 0.0801, - "step": 560 - }, - { - "epoch": 7.0, - "eval_loss": 1.7074089050292969, - "eval_macro_f1": 74.3995628020244, - "eval_macro_precision": 74.43261914638138, - "eval_macro_recall": 74.38095238095238, - "eval_micro_f1": 74.38095238095238, - "eval_micro_precision": 74.38095238095238, - "eval_micro_recall": 74.38095238095238, - "eval_runtime": 5.0045, - "eval_samples_per_second": 209.812, - "eval_steps_per_second": 13.188, + "epoch": 1.492348636061211, + "grad_norm": 7.745892524719238, + "learning_rate": 0.0001979018538024607, + "loss": 0.7877, "step": 560 }, { - "epoch": 7.06269592476489, - "grad_norm": 24.801523208618164, - "learning_rate": 0.00014511245492010335, - "loss": 0.0224, + "epoch": 1.5056553559547572, + "grad_norm": 5.482974529266357, + "learning_rate": 0.00019785852592341598, + "loss": 0.7299, "step": 565 }, { - "epoch": 7.12539184952978, - "grad_norm": 2.976022720336914, - "learning_rate": 0.00014421385556763266, - "loss": 0.1211, + "epoch": 1.5189620758483033, + "grad_norm": 9.263693809509277, + "learning_rate": 0.00019781476007338058, + "loss": 0.9189, "step": 570 }, { - "epoch": 7.1880877742946705, - "grad_norm": 0.37286555767059326, - "learning_rate": 0.00014331079635754193, - "loss": 0.0327, + "epoch": 1.5322687957418495, + "grad_norm": 10.035237312316895, + "learning_rate": 0.00019777055644823086, + "loss": 0.7287, "step": 575 }, { - "epoch": 7.250783699059561, - "grad_norm": 0.7439866662025452, - "learning_rate": 0.0001424033683815365, - "loss": 0.0382, + "epoch": 1.5455755156353959, + "grad_norm": 6.058926105499268, + "learning_rate": 0.0001977259152458025, + "loss": 1.0947, "step": 580 }, { - "epoch": 7.313479623824452, - "grad_norm": 24.909921646118164, - "learning_rate": 0.00014149166317199954, - "loss": 0.0603, + "epoch": 1.5588822355289422, + "grad_norm": 11.625580787658691, + "learning_rate": 0.00019768083666588953, + "loss": 1.0071, "step": 585 }, { - "epoch": 7.376175548589342, - "grad_norm": 9.340410232543945, - "learning_rate": 0.0001405757726927595, - "loss": 0.2319, + "epoch": 1.5721889554224884, + "grad_norm": 6.705043315887451, + "learning_rate": 0.00019763532091024352, + "loss": 0.8665, "step": 590 }, { - "epoch": 7.438871473354232, - "grad_norm": 3.947765350341797, - "learning_rate": 0.00013965578932981346, - "loss": 0.0448, + "epoch": 1.5854956753160345, + "grad_norm": 5.264439582824707, + "learning_rate": 0.00019758936818257264, + "loss": 0.767, "step": 595 }, { - "epoch": 7.501567398119122, - "grad_norm": 0.29921436309814453, - "learning_rate": 0.00013873180588200827, - "loss": 0.006, + "epoch": 1.5988023952095807, + "grad_norm": 14.696879386901855, + "learning_rate": 0.00019754297868854073, + "loss": 0.628, "step": 600 }, { - "epoch": 7.564263322884012, - "grad_norm": 0.11227085441350937, - "learning_rate": 0.00013780391555167992, - "loss": 0.0037, + "epoch": 1.612109115103127, + "grad_norm": 10.865748405456543, + "learning_rate": 0.0001974961526357664, + "loss": 0.7197, "step": 605 }, { - "epoch": 7.6269592476489025, - "grad_norm": 0.041259847581386566, - "learning_rate": 0.0001368722119352521, - "loss": 0.0149, + "epoch": 1.6254158349966734, + "grad_norm": 8.621575355529785, + "learning_rate": 0.00019744889023382215, + "loss": 1.1014, "step": 610 }, { - "epoch": 7.689655172413794, - "grad_norm": 41.212154388427734, - "learning_rate": 0.00013593678901379524, - "loss": 0.0205, + "epoch": 1.6387225548902196, + "grad_norm": 11.162896156311035, + "learning_rate": 0.00019740119169423337, + "loss": 0.9153, "step": 615 }, { - "epoch": 7.752351097178684, - "grad_norm": 12.241500854492188, - "learning_rate": 0.00013499774114354655, - "loss": 0.0038, + "epoch": 1.6520292747837657, + "grad_norm": 10.98604679107666, + "learning_rate": 0.00019735305723047732, + "loss": 1.0044, "step": 620 }, { - "epoch": 7.815047021943574, - "grad_norm": 55.03153991699219, - "learning_rate": 0.00013405516304639234, - "loss": 0.029, + "epoch": 1.665335994677312, + "grad_norm": 10.798745155334473, + "learning_rate": 0.00019730448705798239, + "loss": 1.2465, "step": 625 }, { - "epoch": 7.877742946708464, - "grad_norm": 0.5877093076705933, - "learning_rate": 0.00013310914980031334, - "loss": 0.0072, + "epoch": 1.6786427145708582, + "grad_norm": 7.607357978820801, + "learning_rate": 0.00019725548139412692, + "loss": 0.9162, "step": 630 }, { - "epoch": 7.940438871473354, - "grad_norm": 34.05458068847656, - "learning_rate": 0.00013215979682979428, - "loss": 0.0398, + "epoch": 1.6919494344644046, + "grad_norm": 8.403728485107422, + "learning_rate": 0.00019720604045823836, + "loss": 0.8765, "step": 635 }, { - "epoch": 8.0, - "grad_norm": 86.24744415283203, - "learning_rate": 0.00013120719989619833, - "loss": 0.1489, - "step": 640 - }, - { - "epoch": 8.0, - "eval_loss": 3.0941507816314697, - "eval_macro_f1": 73.75591198712105, - "eval_macro_precision": 74.97115819170575, - "eval_macro_recall": 73.52380952380952, - "eval_micro_f1": 73.52380952380952, - "eval_micro_precision": 73.52380952380952, - "eval_micro_recall": 73.52380952380952, - "eval_runtime": 4.8846, - "eval_samples_per_second": 214.962, - "eval_steps_per_second": 13.512, + "epoch": 1.7052561543579507, + "grad_norm": 8.099462509155273, + "learning_rate": 0.0001971561644715922, + "loss": 0.9799, "step": 640 }, { - "epoch": 8.06269592476489, - "grad_norm": 0.29403501749038696, - "learning_rate": 0.0001302514550881076, - "loss": 0.1469, + "epoch": 1.718562874251497, + "grad_norm": 10.743642807006836, + "learning_rate": 0.00019710585365741103, + "loss": 0.8505, "step": 645 }, { - "epoch": 8.12539184952978, - "grad_norm": 0.0023457759525626898, - "learning_rate": 0.0001292926588116308, - "loss": 0.0521, + "epoch": 1.7318695941450433, + "grad_norm": 6.268372535705566, + "learning_rate": 0.0001970551082408636, + "loss": 0.6592, "step": 650 }, { - "epoch": 8.18808777429467, - "grad_norm": 0.0002847153227776289, - "learning_rate": 0.00012833090778067863, - "loss": 0.0004, + "epoch": 1.7451763140385896, + "grad_norm": 11.54825210571289, + "learning_rate": 0.0001970039284490637, + "loss": 1.1281, "step": 655 }, { - "epoch": 8.25078369905956, - "grad_norm": 0.07453305274248123, - "learning_rate": 0.0001273662990072083, - "loss": 0.0002, + "epoch": 1.7584830339321358, + "grad_norm": 8.397130012512207, + "learning_rate": 0.00019695231451106912, + "loss": 0.8406, "step": 660 }, { - "epoch": 8.31347962382445, - "grad_norm": 0.015363110229372978, - "learning_rate": 0.00012639892979143789, - "loss": 0.0349, + "epoch": 1.771789753825682, + "grad_norm": 6.930296897888184, + "learning_rate": 0.00019690026665788085, + "loss": 0.6492, "step": 665 }, { - "epoch": 8.376175548589341, - "grad_norm": 0.003746453206986189, - "learning_rate": 0.00012542889771203166, - "loss": 0.0002, + "epoch": 1.785096473719228, + "grad_norm": 8.116912841796875, + "learning_rate": 0.00019684778512244172, + "loss": 0.8172, "step": 670 }, { - "epoch": 8.438871473354231, - "grad_norm": 0.07255641371011734, - "learning_rate": 0.00012445630061625729, - "loss": 0.0391, + "epoch": 1.7984031936127745, + "grad_norm": 8.120738983154297, + "learning_rate": 0.00019679487013963564, + "loss": 0.7236, "step": 675 }, { - "epoch": 8.501567398119121, - "grad_norm": 0.006769082974642515, - "learning_rate": 0.00012348123661011601, - "loss": 0.0626, + "epoch": 1.8117099135063208, + "grad_norm": 9.556026458740234, + "learning_rate": 0.00019674152194628638, + "loss": 1.0562, "step": 680 }, { - "epoch": 8.564263322884013, - "grad_norm": 1.8504444360733032, - "learning_rate": 0.00012250380404844665, - "loss": 0.1377, + "epoch": 1.825016633399867, + "grad_norm": 8.71633529663086, + "learning_rate": 0.00019668774078115665, + "loss": 0.6465, "step": 685 }, { - "epoch": 8.626959247648903, - "grad_norm": 30.328672409057617, - "learning_rate": 0.00012152410152500453, - "loss": 0.0094, + "epoch": 1.8383233532934131, + "grad_norm": 7.23399019241333, + "learning_rate": 0.00019663352688494684, + "loss": 0.7309, "step": 690 }, { - "epoch": 8.689655172413794, - "grad_norm": 0.01078982837498188, - "learning_rate": 0.00012054222786251634, - "loss": 0.0014, + "epoch": 1.8516300731869593, + "grad_norm": 10.535300254821777, + "learning_rate": 0.00019657888050029414, + "loss": 0.891, "step": 695 }, { - "epoch": 8.752351097178684, - "grad_norm": 0.02426566183567047, - "learning_rate": 0.00011955828210271187, - "loss": 0.0001, + "epoch": 1.8649367930805056, + "grad_norm": 6.320178985595703, + "learning_rate": 0.00019652380187177126, + "loss": 0.8396, "step": 700 }, { - "epoch": 8.815047021943574, - "grad_norm": 0.3005918562412262, - "learning_rate": 0.00011857236349633358, - "loss": 0.1305, + "epoch": 1.878243512974052, + "grad_norm": 7.133821487426758, + "learning_rate": 0.0001964682912458856, + "loss": 0.8449, "step": 705 }, { - "epoch": 8.877742946708464, - "grad_norm": 40.52849578857422, - "learning_rate": 0.00011758457149312538, - "loss": 0.1226, + "epoch": 1.8915502328675982, + "grad_norm": 9.44013786315918, + "learning_rate": 0.00019641234887107778, + "loss": 0.7616, "step": 710 }, { - "epoch": 8.940438871473354, - "grad_norm": 0.03640785068273544, - "learning_rate": 0.0001165950057318008, - "loss": 0.0081, + "epoch": 1.9048569527611443, + "grad_norm": 9.079183578491211, + "learning_rate": 0.00019635597499772093, + "loss": 1.027, "step": 715 }, { - "epoch": 9.0, - "grad_norm": 0.8364064693450928, - "learning_rate": 0.00011560376602999272, - "loss": 0.1558, - "step": 720 - }, - { - "epoch": 9.0, - "eval_loss": 3.2611937522888184, - "eval_macro_f1": 72.56355683169605, - "eval_macro_precision": 74.47633152198945, - "eval_macro_recall": 72.3809523809524, - "eval_micro_f1": 72.38095238095238, - "eval_micro_precision": 72.38095238095238, - "eval_micro_recall": 72.38095238095238, - "eval_runtime": 4.5879, - "eval_samples_per_second": 228.86, - "eval_steps_per_second": 14.386, + "epoch": 1.9181636726546905, + "grad_norm": 6.522233486175537, + "learning_rate": 0.00019629916987811926, + "loss": 1.0346, "step": 720 }, { - "epoch": 9.06269592476489, - "grad_norm": 0.017020411789417267, - "learning_rate": 0.00011461095237418454, - "loss": 0.0155, + "epoch": 1.9314703925482368, + "grad_norm": 12.074502944946289, + "learning_rate": 0.0001962419337665071, + "loss": 1.1165, "step": 725 }, { - "epoch": 9.12539184952978, - "grad_norm": 0.0012181233614683151, - "learning_rate": 0.00011361666490962468, - "loss": 0.0176, + "epoch": 1.9447771124417832, + "grad_norm": 6.275933265686035, + "learning_rate": 0.00019618426691904762, + "loss": 0.7029, "step": 730 }, { - "epoch": 9.18808777429467, - "grad_norm": 0.04490913078188896, - "learning_rate": 0.00011262100393022482, - "loss": 0.0084, + "epoch": 1.9580838323353293, + "grad_norm": 9.753368377685547, + "learning_rate": 0.0001961261695938319, + "loss": 0.9627, "step": 735 }, { - "epoch": 9.25078369905956, - "grad_norm": 0.0009038946591317654, - "learning_rate": 0.00011162406986844323, - "loss": 0.0, + "epoch": 1.9713905522288755, + "grad_norm": 8.683499336242676, + "learning_rate": 0.00019606764205087757, + "loss": 0.7202, "step": 740 }, { - "epoch": 9.31347962382445, - "grad_norm": 0.0007440209737978876, - "learning_rate": 0.00011062596328515424, - "loss": 0.0207, + "epoch": 1.9846972721224219, + "grad_norm": 8.230753898620605, + "learning_rate": 0.0001960086845521277, + "loss": 0.8854, "step": 745 }, { - "epoch": 9.376175548589341, - "grad_norm": 0.00041356749716214836, - "learning_rate": 0.00010962678485950455, - "loss": 0.0018, + "epoch": 1.998003992015968, + "grad_norm": 8.35472583770752, + "learning_rate": 0.00019594929736144976, + "loss": 1.194, "step": 750 }, { - "epoch": 9.438871473354231, - "grad_norm": 0.0054762535728514194, - "learning_rate": 0.00010862663537875775, - "loss": 0.0, + "epoch": 1.998003992015968, + "eval_loss": 0.36028650403022766, + "eval_macro_f1": 68.21512864198317, + "eval_macro_precision": 70.69761046501635, + "eval_macro_recall": 66.15824103590107, + "eval_micro_f1": 87.3502994011976, + "eval_micro_precision": 87.3502994011976, + "eval_micro_recall": 87.3502994011976, + "eval_runtime": 5.244, + "eval_samples_per_second": 509.537, + "eval_steps_per_second": 31.846, + "step": 750 + }, + { + "epoch": 2.013306719893546, + "grad_norm": 3.9793150424957275, + "learning_rate": 0.00019588948074463416, + "loss": 0.4671, "step": 755 }, { - "epoch": 9.501567398119121, - "grad_norm": 0.027165431529283524, - "learning_rate": 0.00010762561572812788, - "loss": 0.0, + "epoch": 2.0266134397870923, + "grad_norm": 5.490118503570557, + "learning_rate": 0.00019582923496939337, + "loss": 0.4299, "step": 760 }, { - "epoch": 9.564263322884013, - "grad_norm": 0.002276531420648098, - "learning_rate": 0.0001066238268806032, - "loss": 0.0102, + "epoch": 2.039920159680639, + "grad_norm": 11.58049201965332, + "learning_rate": 0.00019576856030536054, + "loss": 0.5549, "step": 765 }, { - "epoch": 9.626959247648903, - "grad_norm": 0.14534221589565277, - "learning_rate": 0.00010562136988676078, - "loss": 0.0001, + "epoch": 2.053226879574185, + "grad_norm": 12.648699760437012, + "learning_rate": 0.0001957074570240883, + "loss": 0.2281, "step": 770 }, { - "epoch": 9.689655172413794, - "grad_norm": 0.0010948021663352847, - "learning_rate": 0.00010461834586457398, - "loss": 0.0, + "epoch": 2.066533599467731, + "grad_norm": 7.2631754875183105, + "learning_rate": 0.0001956459253990476, + "loss": 0.35, "step": 775 }, { - "epoch": 9.752351097178684, - "grad_norm": 5.5901953601278365e-05, - "learning_rate": 0.00010361485598921212, - "loss": 0.104, + "epoch": 2.0798403193612773, + "grad_norm": 8.948901176452637, + "learning_rate": 0.0001955839657056265, + "loss": 0.3369, "step": 780 }, { - "epoch": 9.815047021943574, - "grad_norm": 0.8184991478919983, - "learning_rate": 0.00010261100148283538, - "loss": 0.0106, + "epoch": 2.0931470392548235, + "grad_norm": 4.452752113342285, + "learning_rate": 0.0001955215782211289, + "loss": 0.3592, "step": 785 }, { - "epoch": 9.877742946708464, - "grad_norm": 0.006693361327052116, - "learning_rate": 0.00010160688360438419, - "loss": 0.0, + "epoch": 2.10645375914837, + "grad_norm": 13.342286109924316, + "learning_rate": 0.0001954587632247732, + "loss": 0.5003, "step": 790 }, { - "epoch": 9.940438871473354, - "grad_norm": 59.32970428466797, - "learning_rate": 0.00010060260363936547, - "loss": 0.052, + "epoch": 2.1197604790419162, + "grad_norm": 5.800313472747803, + "learning_rate": 0.00019539552099769126, + "loss": 0.384, "step": 795 }, { - "epoch": 10.0, - "grad_norm": 0.021533485502004623, - "learning_rate": 9.959826288963571e-05, - "loss": 0.0, - "step": 800 - }, - { - "epoch": 10.0, - "eval_loss": 3.001224994659424, - "eval_macro_f1": 74.98933185346014, - "eval_macro_precision": 75.36484226701617, - "eval_macro_recall": 74.85714285714286, - "eval_micro_f1": 74.85714285714286, - "eval_micro_precision": 74.85714285714286, - "eval_micro_recall": 74.85714285714286, - "eval_runtime": 4.9316, - "eval_samples_per_second": 212.912, - "eval_steps_per_second": 13.383, + "epoch": 2.1330671989354624, + "grad_norm": 5.854800701141357, + "learning_rate": 0.00019533185182292703, + "loss": 0.3074, "step": 800 }, { - "epoch": 10.06269592476489, - "grad_norm": 0.10546494275331497, - "learning_rate": 9.85939626631829e-05, - "loss": 0.0001, + "epoch": 2.1463739188290085, + "grad_norm": 6.107956409454346, + "learning_rate": 0.00019526775598543522, + "loss": 0.3026, "step": 805 }, { - "epoch": 10.12539184952978, - "grad_norm": 0.0011004558764398098, - "learning_rate": 9.758980426390732e-05, - "loss": 0.0, + "epoch": 2.1596806387225547, + "grad_norm": 5.478348731994629, + "learning_rate": 0.00019520323377208017, + "loss": 0.3378, "step": 810 }, { - "epoch": 10.18808777429467, - "grad_norm": 0.02645757608115673, - "learning_rate": 9.658588898140322e-05, - "loss": 0.0, + "epoch": 2.1729873586161013, + "grad_norm": 8.135194778442383, + "learning_rate": 0.00019513828547163437, + "loss": 0.458, "step": 815 }, { - "epoch": 10.25078369905956, - "grad_norm": 0.06658319383859634, - "learning_rate": 9.558231808074156e-05, - "loss": 0.0001, + "epoch": 2.1862940785096474, + "grad_norm": 3.8288862705230713, + "learning_rate": 0.00019507291137477742, + "loss": 0.2756, "step": 820 }, { - "epoch": 10.31347962382445, - "grad_norm": 0.1821487843990326, - "learning_rate": 9.457919279225548e-05, - "loss": 0.0, + "epoch": 2.1996007984031936, + "grad_norm": 9.290178298950195, + "learning_rate": 0.00019500711177409454, + "loss": 0.2866, "step": 825 }, { - "epoch": 10.376175548589341, - "grad_norm": 0.0002453463093843311, - "learning_rate": 9.357661430132915e-05, - "loss": 0.0, + "epoch": 2.2129075182967397, + "grad_norm": 5.381500720977783, + "learning_rate": 0.00019494088696407532, + "loss": 0.2167, "step": 830 }, { - "epoch": 10.438871473354231, - "grad_norm": 0.0003151444543618709, - "learning_rate": 9.257468373819123e-05, - "loss": 0.0, + "epoch": 2.2262142381902863, + "grad_norm": 6.5513458251953125, + "learning_rate": 0.0001948742372411123, + "loss": 0.2532, "step": 835 }, { - "epoch": 10.501567398119121, - "grad_norm": 4.588846059050411e-05, - "learning_rate": 9.157350216771378e-05, - "loss": 0.0, + "epoch": 2.2395209580838324, + "grad_norm": 11.36906623840332, + "learning_rate": 0.00019480716290349995, + "loss": 0.3407, "step": 840 }, { - "epoch": 10.564263322884013, - "grad_norm": 0.0018681439105421305, - "learning_rate": 9.057317057921787e-05, - "loss": 0.0, + "epoch": 2.2528276779773786, + "grad_norm": 6.397517681121826, + "learning_rate": 0.00019473966425143292, + "loss": 0.2588, "step": 845 }, { - "epoch": 10.626959247648903, - "grad_norm": 0.0006441921577788889, - "learning_rate": 8.957378987628682e-05, - "loss": 0.0, + "epoch": 2.2661343978709247, + "grad_norm": 11.072936058044434, + "learning_rate": 0.00019467174158700504, + "loss": 0.2575, "step": 850 }, { - "epoch": 10.689655172413794, - "grad_norm": 0.0007152777980081737, - "learning_rate": 8.857546086658789e-05, - "loss": 0.0, + "epoch": 2.279441117764471, + "grad_norm": 0.4234752058982849, + "learning_rate": 0.00019460339521420772, + "loss": 0.1578, "step": 855 }, { - "epoch": 10.752351097178684, - "grad_norm": 0.0002644763153512031, - "learning_rate": 8.757828425170404e-05, - "loss": 0.0173, + "epoch": 2.2927478376580175, + "grad_norm": 52.92380905151367, + "learning_rate": 0.00019453462543892882, + "loss": 0.6933, "step": 860 }, { - "epoch": 10.815047021943574, - "grad_norm": 0.0004763313045259565, - "learning_rate": 8.658236061697586e-05, - "loss": 0.0, + "epoch": 2.3060545575515636, + "grad_norm": 18.455066680908203, + "learning_rate": 0.000194465432568951, + "loss": 0.3722, "step": 865 }, { - "epoch": 10.877742946708464, - "grad_norm": 0.07049086689949036, - "learning_rate": 8.55877904213558e-05, - "loss": 0.0, + "epoch": 2.31936127744511, + "grad_norm": 12.93152141571045, + "learning_rate": 0.00019439581691395067, + "loss": 0.3835, "step": 870 }, { - "epoch": 10.940438871473354, - "grad_norm": 0.00021191804262343794, - "learning_rate": 8.459467398727462e-05, - "loss": 0.0, + "epoch": 2.332667997338656, + "grad_norm": 5.5360212326049805, + "learning_rate": 0.00019432577878549637, + "loss": 0.339, "step": 875 }, { - "epoch": 11.0, - "grad_norm": 0.0024338788352906704, - "learning_rate": 8.360311149052205e-05, - "loss": 0.0, - "step": 880 - }, - { - "epoch": 11.0, - "eval_loss": 2.944465160369873, - "eval_macro_f1": 73.94965715163707, - "eval_macro_precision": 73.91418752481633, - "eval_macro_recall": 73.99999999999999, - "eval_micro_f1": 74.0, - "eval_micro_precision": 74.0, - "eval_micro_recall": 74.0, - "eval_runtime": 3.4955, - "eval_samples_per_second": 300.386, - "eval_steps_per_second": 18.881, + "epoch": 2.345974717232202, + "grad_norm": 6.381242752075195, + "learning_rate": 0.0001942553184970474, + "loss": 0.3283, "step": 880 }, { - "epoch": 11.06269592476489, - "grad_norm": 0.0007475401507690549, - "learning_rate": 8.261320295014182e-05, - "loss": 0.0, + "epoch": 2.3592814371257487, + "grad_norm": 7.571563720703125, + "learning_rate": 0.00019418443636395248, + "loss": 0.3261, "step": 885 }, { - "epoch": 11.12539184952978, - "grad_norm": 5.158692511031404e-05, - "learning_rate": 8.162504821834295e-05, - "loss": 0.0, + "epoch": 2.372588157019295, + "grad_norm": 2.1912384033203125, + "learning_rate": 0.00019411313270344837, + "loss": 0.3227, "step": 890 }, { - "epoch": 11.18808777429467, - "grad_norm": 0.0004298978892620653, - "learning_rate": 8.06387469704276e-05, - "loss": 0.0, + "epoch": 2.385894876912841, + "grad_norm": 4.941147804260254, + "learning_rate": 0.00019404140783465837, + "loss": 0.5089, "step": 895 }, { - "epoch": 11.25078369905956, - "grad_norm": 0.005391178652644157, - "learning_rate": 7.965439869473664e-05, - "loss": 0.0, + "epoch": 2.399201596806387, + "grad_norm": 6.276638507843018, + "learning_rate": 0.00019396926207859084, + "loss": 0.4068, "step": 900 }, { - "epoch": 11.31347962382445, - "grad_norm": 0.00016025469813030213, - "learning_rate": 7.867210268261439e-05, - "loss": 0.0, + "epoch": 2.4125083166999337, + "grad_norm": 4.92919397354126, + "learning_rate": 0.000193896695758138, + "loss": 0.4691, "step": 905 }, { - "epoch": 11.376175548589341, - "grad_norm": 0.00021810800535604358, - "learning_rate": 7.769195801839313e-05, - "loss": 0.0, + "epoch": 2.42581503659348, + "grad_norm": 6.015539169311523, + "learning_rate": 0.00019382370919807419, + "loss": 0.475, "step": 910 }, { - "epoch": 11.438871473354231, - "grad_norm": 0.0006552350823767483, - "learning_rate": 7.671406356939836e-05, - "loss": 0.0, + "epoch": 2.439121756487026, + "grad_norm": 16.059900283813477, + "learning_rate": 0.00019375030272505463, + "loss": 0.596, "step": 915 }, { - "epoch": 11.501567398119121, - "grad_norm": 0.001337722409516573, - "learning_rate": 7.573851797597602e-05, - "loss": 0.0, + "epoch": 2.452428476380572, + "grad_norm": 4.761904716491699, + "learning_rate": 0.00019367647666761385, + "loss": 0.6156, "step": 920 }, { - "epoch": 11.564263322884013, - "grad_norm": 0.00711465161293745, - "learning_rate": 7.476541964154269e-05, - "loss": 0.0, + "epoch": 2.4657351962741183, + "grad_norm": 11.512598037719727, + "learning_rate": 0.00019360223135616426, + "loss": 0.4051, "step": 925 }, { - "epoch": 11.626959247648903, - "grad_norm": 3.7509220419451594e-05, - "learning_rate": 7.379486672265964e-05, - "loss": 0.0, + "epoch": 2.479041916167665, + "grad_norm": 5.330412864685059, + "learning_rate": 0.00019352756712299468, + "loss": 0.2981, "step": 930 }, { - "epoch": 11.689655172413794, - "grad_norm": 0.00123989034909755, - "learning_rate": 7.28269571191317e-05, - "loss": 0.0, + "epoch": 2.492348636061211, + "grad_norm": 3.5500071048736572, + "learning_rate": 0.0001934524843022688, + "loss": 0.485, "step": 935 }, { - "epoch": 11.752351097178684, - "grad_norm": 0.00043695452040992677, - "learning_rate": 7.186178846413214e-05, - "loss": 0.0, + "epoch": 2.505655355954757, + "grad_norm": 6.358755111694336, + "learning_rate": 0.0001933769832300237, + "loss": 0.5665, "step": 940 }, { - "epoch": 11.815047021943574, - "grad_norm": 0.0020732777193188667, - "learning_rate": 7.089945811435433e-05, - "loss": 0.0, + "epoch": 2.5189620758483033, + "grad_norm": 12.199417114257812, + "learning_rate": 0.00019330106424416852, + "loss": 0.4757, "step": 945 }, { - "epoch": 11.877742946708464, - "grad_norm": 0.0006961887702345848, - "learning_rate": 6.994006314019141e-05, - "loss": 0.0, + "epoch": 2.5322687957418495, + "grad_norm": 11.047940254211426, + "learning_rate": 0.00019322472768448258, + "loss": 0.3742, "step": 950 }, { - "epoch": 11.940438871473354, - "grad_norm": 0.00014696276048198342, - "learning_rate": 6.898370031594487e-05, - "loss": 0.0, + "epoch": 2.545575515635396, + "grad_norm": 5.156216144561768, + "learning_rate": 0.00019314797389261424, + "loss": 0.4561, "step": 955 }, { - "epoch": 12.0, - "grad_norm": 0.004987122491002083, - "learning_rate": 6.803046611006278e-05, - "loss": 0.0, - "step": 960 - }, - { - "epoch": 12.0, - "eval_loss": 2.9617700576782227, - "eval_macro_f1": 74.48450982070439, - "eval_macro_precision": 74.49341191282451, - "eval_macro_recall": 74.47619047619047, - "eval_micro_f1": 74.47619047619047, - "eval_micro_precision": 74.47619047619047, - "eval_micro_recall": 74.47619047619047, - "eval_runtime": 3.4656, - "eval_samples_per_second": 302.974, - "eval_steps_per_second": 19.044, + "epoch": 2.5588822355289422, + "grad_norm": 5.482617378234863, + "learning_rate": 0.00019307080321207912, + "loss": 0.5187, "step": 960 }, { - "epoch": 12.06269592476489, - "grad_norm": 3.519210804370232e-05, - "learning_rate": 6.708045667540897e-05, - "loss": 0.0, + "epoch": 2.5721889554224884, + "grad_norm": 7.736295700073242, + "learning_rate": 0.00019299321598825866, + "loss": 0.4599, "step": 965 }, { - "epoch": 12.12539184952978, - "grad_norm": 0.0011250913375988603, - "learning_rate": 6.613376783956423e-05, - "loss": 0.0, + "epoch": 2.5854956753160345, + "grad_norm": 3.486142635345459, + "learning_rate": 0.00019291521256839858, + "loss": 0.2753, "step": 970 }, { - "epoch": 12.18808777429467, - "grad_norm": 5.886521466891281e-05, - "learning_rate": 6.519049509516013e-05, - "loss": 0.0, + "epoch": 2.5988023952095807, + "grad_norm": 4.562074184417725, + "learning_rate": 0.00019283679330160726, + "loss": 0.2733, "step": 975 }, { - "epoch": 12.25078369905956, - "grad_norm": 0.0004661143757402897, - "learning_rate": 6.425073359024663e-05, - "loss": 0.0, + "epoch": 2.6121091151031273, + "grad_norm": 13.783202171325684, + "learning_rate": 0.00019275795853885433, + "loss": 0.3718, "step": 980 }, { - "epoch": 12.31347962382445, - "grad_norm": 0.00029624096350744367, - "learning_rate": 6.331457811869437e-05, - "loss": 0.0, + "epoch": 2.6254158349966734, + "grad_norm": 9.088604927062988, + "learning_rate": 0.00019267870863296887, + "loss": 0.2797, "step": 985 }, { - "epoch": 12.376175548589341, - "grad_norm": 0.00012905469338875264, - "learning_rate": 6.2382123110633e-05, - "loss": 0.0, + "epoch": 2.6387225548902196, + "grad_norm": 10.746959686279297, + "learning_rate": 0.00019259904393863802, + "loss": 0.5173, "step": 990 }, { - "epoch": 12.438871473354231, - "grad_norm": 0.00013497307372745126, - "learning_rate": 6.145346262292595e-05, - "loss": 0.0, + "epoch": 2.6520292747837657, + "grad_norm": 29.163745880126953, + "learning_rate": 0.00019251896481240537, + "loss": 0.4255, "step": 995 }, { - "epoch": 12.501567398119121, - "grad_norm": 0.0008577414555475116, - "learning_rate": 6.052869032968285e-05, - "loss": 0.0, + "epoch": 2.665335994677312, + "grad_norm": 10.095052719116211, + "learning_rate": 0.0001924384716126692, + "loss": 0.6098, "step": 1000 }, { - "epoch": 12.564263322884013, - "grad_norm": 0.0001506131811765954, - "learning_rate": 5.960789951281052e-05, - "loss": 0.0, + "epoch": 2.6786427145708585, + "grad_norm": 4.459863185882568, + "learning_rate": 0.0001923575646996811, + "loss": 0.2524, "step": 1005 }, { - "epoch": 12.626959247648903, - "grad_norm": 0.00014009448932483792, - "learning_rate": 5.8691183052603834e-05, - "loss": 0.0, + "epoch": 2.6919494344644046, + "grad_norm": 8.43911361694336, + "learning_rate": 0.00019227624443554425, + "loss": 0.5336, "step": 1010 }, { - "epoch": 12.689655172413794, - "grad_norm": 2.588312781881541e-05, - "learning_rate": 5.777863341837675e-05, - "loss": 0.0, + "epoch": 2.7052561543579507, + "grad_norm": 7.737038612365723, + "learning_rate": 0.0001921945111842117, + "loss": 0.3688, "step": 1015 }, { - "epoch": 12.752351097178684, - "grad_norm": 0.00011443781841080636, - "learning_rate": 5.687034265913485e-05, - "loss": 0.0, + "epoch": 2.718562874251497, + "grad_norm": 10.94029712677002, + "learning_rate": 0.000192112365311485, + "loss": 0.4579, "step": 1020 }, { - "epoch": 12.815047021943574, - "grad_norm": 0.00010904014925472438, - "learning_rate": 5.596640239429051e-05, - "loss": 0.0, + "epoch": 2.731869594145043, + "grad_norm": 6.617719650268555, + "learning_rate": 0.0001920298071850123, + "loss": 0.1729, "step": 1025 }, { - "epoch": 12.877742946708464, - "grad_norm": 5.243328268988989e-05, - "learning_rate": 5.5066903804421025e-05, - "loss": 0.0, + "epoch": 2.7451763140385896, + "grad_norm": 15.398009300231934, + "learning_rate": 0.00019194683717428687, + "loss": 0.4216, "step": 1030 }, { - "epoch": 12.940438871473354, - "grad_norm": 0.0001976548373932019, - "learning_rate": 5.4171937622071435e-05, - "loss": 0.0, + "epoch": 2.758483033932136, + "grad_norm": 9.530508995056152, + "learning_rate": 0.00019186345565064535, + "loss": 0.5025, "step": 1035 }, { - "epoch": 13.0, - "grad_norm": 1.4458280929829925e-05, - "learning_rate": 5.32815941226022e-05, - "loss": 0.0, - "step": 1040 - }, - { - "epoch": 13.0, - "eval_loss": 2.9744179248809814, - "eval_macro_f1": 74.495166799055, - "eval_macro_precision": 74.51671347356447, - "eval_macro_recall": 74.47619047619047, - "eval_micro_f1": 74.47619047619047, - "eval_micro_precision": 74.47619047619047, - "eval_micro_recall": 74.47619047619047, - "eval_runtime": 3.7357, - "eval_samples_per_second": 281.069, - "eval_steps_per_second": 17.667, + "epoch": 2.771789753825682, + "grad_norm": 7.099113464355469, + "learning_rate": 0.00019177966298726613, + "loss": 0.3395, "step": 1040 }, { - "epoch": 13.06269592476489, - "grad_norm": 0.00015813493519090116, - "learning_rate": 5.2395963115083104e-05, - "loss": 0.0, + "epoch": 2.785096473719228, + "grad_norm": 15.377814292907715, + "learning_rate": 0.0001916954595591677, + "loss": 0.579, "step": 1045 }, { - "epoch": 13.12539184952978, - "grad_norm": 5.232647708908189e-06, - "learning_rate": 5.151513393323426e-05, - "loss": 0.0, + "epoch": 2.7984031936127742, + "grad_norm": 3.2470192909240723, + "learning_rate": 0.00019161084574320696, + "loss": 0.5129, "step": 1050 }, { - "epoch": 13.18808777429467, - "grad_norm": 0.0007055670721456409, - "learning_rate": 5.06391954264149e-05, - "loss": 0.0, + "epoch": 2.811709913506321, + "grad_norm": 8.268476486206055, + "learning_rate": 0.0001915258219180775, + "loss": 0.4809, "step": 1055 }, { - "epoch": 13.25078369905956, - "grad_norm": 0.0001715083053568378, - "learning_rate": 4.976823595066128e-05, - "loss": 0.0, + "epoch": 2.825016633399867, + "grad_norm": 23.91556739807129, + "learning_rate": 0.0001914403884643079, + "loss": 0.566, "step": 1060 }, { - "epoch": 13.31347962382445, - "grad_norm": 0.0010762359015643597, - "learning_rate": 4.8902343359774085e-05, - "loss": 0.0, + "epoch": 2.838323353293413, + "grad_norm": 5.273834228515625, + "learning_rate": 0.0001913545457642601, + "loss": 0.4376, "step": 1065 }, { - "epoch": 13.376175548589341, - "grad_norm": 0.0003584644291549921, - "learning_rate": 4.804160499645667e-05, - "loss": 0.0, + "epoch": 2.8516300731869593, + "grad_norm": 16.142292022705078, + "learning_rate": 0.00019126829420212764, + "loss": 0.2264, "step": 1070 }, { - "epoch": 13.438871473354231, - "grad_norm": 0.00013147966819815338, - "learning_rate": 4.7186107683504656e-05, - "loss": 0.0, + "epoch": 2.8649367930805054, + "grad_norm": 26.231626510620117, + "learning_rate": 0.00019118163416393392, + "loss": 0.3179, "step": 1075 }, { - "epoch": 13.501567398119121, - "grad_norm": 0.00026007898850366473, - "learning_rate": 4.6335937715048306e-05, - "loss": 0.0, + "epoch": 2.878243512974052, + "grad_norm": 6.147984981536865, + "learning_rate": 0.0001910945660375305, + "loss": 0.2849, "step": 1080 }, { - "epoch": 13.564263322884013, - "grad_norm": 8.957670797826722e-05, - "learning_rate": 4.549118084784788e-05, - "loss": 0.0, + "epoch": 2.891550232867598, + "grad_norm": 17.026046752929688, + "learning_rate": 0.0001910070902125954, + "loss": 0.5235, "step": 1085 }, { - "epoch": 13.626959247648903, - "grad_norm": 7.390981409116648e-06, - "learning_rate": 4.465192229264337e-05, - "loss": 0.0, + "epoch": 2.9048569527611443, + "grad_norm": 10.020215034484863, + "learning_rate": 0.0001909192070806313, + "loss": 0.3592, "step": 1090 }, { - "epoch": 13.689655172413794, - "grad_norm": 0.0010093646124005318, - "learning_rate": 4.381824670555934e-05, - "loss": 0.0, + "epoch": 2.9181636726546905, + "grad_norm": 12.995983123779297, + "learning_rate": 0.0001908309170349637, + "loss": 0.4643, "step": 1095 }, { - "epoch": 13.752351097178684, - "grad_norm": 0.0001321593881584704, - "learning_rate": 4.29902381795655e-05, - "loss": 0.0, + "epoch": 2.9314703925482366, + "grad_norm": 8.028570175170898, + "learning_rate": 0.00019074222047073947, + "loss": 0.6559, "step": 1100 }, { - "epoch": 13.815047021943574, - "grad_norm": 0.0003493933181744069, - "learning_rate": 4.216798023599441e-05, - "loss": 0.0, + "epoch": 2.944777112441783, + "grad_norm": 7.6127471923828125, + "learning_rate": 0.00019065311778492468, + "loss": 0.5201, "step": 1105 }, { - "epoch": 13.877742946708464, - "grad_norm": 0.00017224319162778556, - "learning_rate": 4.135155581611661e-05, - "loss": 0.0, + "epoch": 2.9580838323353293, + "grad_norm": 2.081876039505005, + "learning_rate": 0.0001905636093763031, + "loss": 0.2437, "step": 1110 }, { - "epoch": 13.940438871473354, - "grad_norm": 0.0007415884756483138, - "learning_rate": 4.0541047272774315e-05, - "loss": 0.0, + "epoch": 2.9713905522288755, + "grad_norm": 20.373952865600586, + "learning_rate": 0.00019047369564547436, + "loss": 0.8751, "step": 1115 }, { - "epoch": 14.0, - "grad_norm": 0.00015337667718995363, - "learning_rate": 3.973653636207437e-05, - "loss": 0.0, + "epoch": 2.984697272122422, + "grad_norm": 11.040226936340332, + "learning_rate": 0.00019038337699485208, + "loss": 0.2046, "step": 1120 }, { - "epoch": 14.0, - "eval_loss": 2.978665351867676, - "eval_macro_f1": 74.39737250761942, - "eval_macro_precision": 74.41561381963534, - "eval_macro_recall": 74.38095238095238, - "eval_micro_f1": 74.38095238095238, - "eval_micro_precision": 74.38095238095238, - "eval_micro_recall": 74.38095238095238, - "eval_runtime": 4.1748, - "eval_samples_per_second": 251.508, - "eval_steps_per_second": 15.809, - "step": 1120 + "epoch": 2.998003992015968, + "grad_norm": 14.944093704223633, + "learning_rate": 0.00019029265382866214, + "loss": 0.5185, + "step": 1125 }, { - "epoch": 14.06269592476489, - "grad_norm": 0.00025141274090856314, - "learning_rate": 3.893810423514172e-05, - "loss": 0.0, + "epoch": 2.998003992015968, + "eval_loss": 0.556496798992157, + "eval_macro_f1": 67.9724393832533, + "eval_macro_precision": 66.8065205424496, + "eval_macro_recall": 69.37092559087907, + "eval_micro_f1": 86.00299401197606, + "eval_micro_precision": 86.00299401197606, + "eval_micro_recall": 86.00299401197606, + "eval_runtime": 4.5031, + "eval_samples_per_second": 593.365, + "eval_steps_per_second": 37.085, "step": 1125 }, { - "epoch": 14.12539184952978, - "grad_norm": 0.0002585667825769633, - "learning_rate": 3.814583142993352e-05, - "loss": 0.0, + "epoch": 3.013306719893546, + "grad_norm": 7.3535308837890625, + "learning_rate": 0.00019020152655294085, + "loss": 0.4967, "step": 1130 }, { - "epoch": 14.18808777429467, - "grad_norm": 1.575879105075728e-05, - "learning_rate": 3.7359797863115283e-05, - "loss": 0.0, + "epoch": 3.0266134397870923, + "grad_norm": 7.688624382019043, + "learning_rate": 0.00019010999557553317, + "loss": 0.2367, "step": 1135 }, { - "epoch": 14.25078369905956, - "grad_norm": 3.0083991077844985e-05, - "learning_rate": 3.6580082821999786e-05, - "loss": 0.0, + "epoch": 3.039920159680639, + "grad_norm": 7.046450138092041, + "learning_rate": 0.0001900180613060908, + "loss": 0.1927, "step": 1140 }, { - "epoch": 14.31347962382445, - "grad_norm": 0.0006605722010135651, - "learning_rate": 3.580676495654911e-05, - "loss": 0.0, + "epoch": 3.053226879574185, + "grad_norm": 3.205350637435913, + "learning_rate": 0.0001899257241560704, + "loss": 0.1934, "step": 1145 }, { - "epoch": 14.376175548589341, - "grad_norm": 0.00018690273282118142, - "learning_rate": 3.503992227144147e-05, - "loss": 0.0, + "epoch": 3.066533599467731, + "grad_norm": 17.599550247192383, + "learning_rate": 0.0001898329845387317, + "loss": 0.2969, "step": 1150 }, { - "epoch": 14.438871473354231, - "grad_norm": 0.00042844025301747024, - "learning_rate": 3.427963211820274e-05, - "loss": 0.0, + "epoch": 3.0798403193612773, + "grad_norm": 30.870227813720703, + "learning_rate": 0.00018973984286913584, + "loss": 0.3991, "step": 1155 }, { - "epoch": 14.501567398119121, - "grad_norm": 0.00033945144969038665, - "learning_rate": 3.352597118740404e-05, - "loss": 0.0, + "epoch": 3.0931470392548235, + "grad_norm": 1.632244348526001, + "learning_rate": 0.0001896462995641432, + "loss": 0.3399, "step": 1160 }, { - "epoch": 14.564263322884013, - "grad_norm": 5.347471233108081e-05, - "learning_rate": 3.277901550092581e-05, - "loss": 0.0, + "epoch": 3.10645375914837, + "grad_norm": 1.5939713716506958, + "learning_rate": 0.00018955235504241187, + "loss": 0.2284, "step": 1165 }, { - "epoch": 14.626959247648903, - "grad_norm": 0.0009476160048507154, - "learning_rate": 3.2038840404289705e-05, - "loss": 0.0, + "epoch": 3.1197604790419162, + "grad_norm": 5.369167327880859, + "learning_rate": 0.00018945800972439538, + "loss": 0.2094, "step": 1170 }, { - "epoch": 14.689655172413794, - "grad_norm": 0.0006543208146467805, - "learning_rate": 3.13055205590583e-05, - "loss": 0.0, + "epoch": 3.1330671989354624, + "grad_norm": 10.1832914352417, + "learning_rate": 0.00018936326403234125, + "loss": 0.1946, "step": 1175 }, { - "epoch": 14.752351097178684, - "grad_norm": 6.901170854689553e-05, - "learning_rate": 3.0579129935304066e-05, - "loss": 0.0, + "epoch": 3.1463739188290085, + "grad_norm": 0.6386124491691589, + "learning_rate": 0.00018926811839028876, + "loss": 0.1098, "step": 1180 }, { - "epoch": 14.815047021943574, - "grad_norm": 0.0006354337092489004, - "learning_rate": 2.9859741804147957e-05, - "loss": 0.0, + "epoch": 3.1596806387225547, + "grad_norm": 12.72846508026123, + "learning_rate": 0.00018917257322406734, + "loss": 0.1211, "step": 1185 }, { - "epoch": 14.877742946708464, - "grad_norm": 0.0009218246559612453, - "learning_rate": 2.9147428730368475e-05, - "loss": 0.0, + "epoch": 3.1729873586161013, + "grad_norm": 12.194605827331543, + "learning_rate": 0.00018907662896129433, + "loss": 0.4173, "step": 1190 }, { - "epoch": 14.940438871473354, - "grad_norm": 0.0008917151717469096, - "learning_rate": 2.844226256508221e-05, - "loss": 0.0, + "epoch": 3.1862940785096474, + "grad_norm": 4.248966217041016, + "learning_rate": 0.00018898028603137341, + "loss": 0.1053, "step": 1195 }, { - "epoch": 15.0, - "grad_norm": 4.064366657985374e-05, - "learning_rate": 2.7744314438496088e-05, - "loss": 0.0, - "step": 1200 - }, - { - "epoch": 15.0, - "eval_loss": 2.9775209426879883, - "eval_macro_f1": 74.59288643616037, - "eval_macro_precision": 74.61805414620287, - "eval_macro_recall": 74.57142857142857, - "eval_micro_f1": 74.57142857142857, - "eval_micro_precision": 74.57142857142857, - "eval_micro_recall": 74.57142857142857, - "eval_runtime": 4.2475, - "eval_samples_per_second": 247.204, - "eval_steps_per_second": 15.539, + "epoch": 3.1996007984031936, + "grad_norm": 3.8942484855651855, + "learning_rate": 0.00018888354486549237, + "loss": 0.1792, "step": 1200 }, { - "epoch": 15.06269592476489, - "grad_norm": 0.0005438003572635353, - "learning_rate": 2.7053654752732528e-05, - "loss": 0.0, + "epoch": 3.2129075182967397, + "grad_norm": 5.4043755531311035, + "learning_rate": 0.0001887864058966214, + "loss": 0.2308, "step": 1205 }, { - "epoch": 15.12539184952978, - "grad_norm": 6.86753774061799e-05, - "learning_rate": 2.6370353174727836e-05, - "loss": 0.0, + "epoch": 3.2262142381902863, + "grad_norm": 23.204421997070312, + "learning_rate": 0.00018868886955951115, + "loss": 0.2898, "step": 1210 }, { - "epoch": 15.18808777429467, - "grad_norm": 0.00032200937857851386, - "learning_rate": 2.5694478629205078e-05, - "loss": 0.0, + "epoch": 3.2395209580838324, + "grad_norm": 19.77983856201172, + "learning_rate": 0.00018859093629069058, + "loss": 0.1953, "step": 1215 }, { - "epoch": 15.25078369905956, - "grad_norm": 0.0002235895226476714, - "learning_rate": 2.5026099291721516e-05, - "loss": 0.0, + "epoch": 3.2528276779773786, + "grad_norm": 10.021352767944336, + "learning_rate": 0.00018849260652846519, + "loss": 0.3812, "step": 1220 }, { - "epoch": 15.31347962382445, - "grad_norm": 0.00034938243334181607, - "learning_rate": 2.4365282581791782e-05, - "loss": 0.0, + "epoch": 3.2661343978709247, + "grad_norm": 3.085806131362915, + "learning_rate": 0.00018839388071291503, + "loss": 0.1707, "step": 1225 }, { - "epoch": 15.376175548589341, - "grad_norm": 0.00023688429791945964, - "learning_rate": 2.371209515608718e-05, - "loss": 0.0, + "epoch": 3.279441117764471, + "grad_norm": 1.3998228311538696, + "learning_rate": 0.00018829475928589271, + "loss": 0.1516, "step": 1230 }, { - "epoch": 15.438871473354231, - "grad_norm": 0.0008163132588379085, - "learning_rate": 2.3066602901712108e-05, - "loss": 0.0, + "epoch": 3.2927478376580175, + "grad_norm": 6.1686835289001465, + "learning_rate": 0.00018819524269102136, + "loss": 0.3816, "step": 1235 }, { - "epoch": 15.501567398119121, - "grad_norm": 0.00019654417701531202, - "learning_rate": 2.242887092955801e-05, - "loss": 0.0, + "epoch": 3.3060545575515636, + "grad_norm": 6.946842670440674, + "learning_rate": 0.0001880953313736928, + "loss": 0.1497, "step": 1240 }, { - "epoch": 15.564263322884013, - "grad_norm": 2.644214873726014e-05, - "learning_rate": 2.1798963567735608e-05, - "loss": 0.0, + "epoch": 3.31936127744511, + "grad_norm": 3.2674038410186768, + "learning_rate": 0.00018799502578106534, + "loss": 0.1253, "step": 1245 }, { - "epoch": 15.626959247648903, - "grad_norm": 0.0005443547270260751, - "learning_rate": 2.1176944355086058e-05, - "loss": 0.0, + "epoch": 3.332667997338656, + "grad_norm": 4.695175647735596, + "learning_rate": 0.00018789432636206197, + "loss": 0.1789, "step": 1250 }, { - "epoch": 15.689655172413794, - "grad_norm": 0.00014729479153174907, - "learning_rate": 2.0562876034771882e-05, - "loss": 0.0, + "epoch": 3.345974717232202, + "grad_norm": 14.17712116241455, + "learning_rate": 0.00018779323356736826, + "loss": 0.1984, "step": 1255 }, { - "epoch": 15.752351097178684, - "grad_norm": 0.0004818035813514143, - "learning_rate": 1.995682054794803e-05, - "loss": 0.0, + "epoch": 3.3592814371257487, + "grad_norm": 9.122124671936035, + "learning_rate": 0.0001876917478494303, + "loss": 0.175, "step": 1260 }, { - "epoch": 15.815047021943574, - "grad_norm": 0.0002842153771780431, - "learning_rate": 1.935883902751382e-05, - "loss": 0.0, + "epoch": 3.372588157019295, + "grad_norm": 5.063784122467041, + "learning_rate": 0.00018758986966245283, + "loss": 0.2501, "step": 1265 }, { - "epoch": 15.877742946708464, - "grad_norm": 0.00016755808610469103, - "learning_rate": 1.8768991791946456e-05, - "loss": 0.0, + "epoch": 3.385894876912841, + "grad_norm": 8.168728828430176, + "learning_rate": 0.000187487599462397, + "loss": 0.2839, "step": 1270 }, { - "epoch": 15.940438871473354, - "grad_norm": 0.0001340256567345932, - "learning_rate": 1.8187338339216775e-05, - "loss": 0.0, + "epoch": 3.399201596806387, + "grad_norm": 6.399585247039795, + "learning_rate": 0.00018738493770697852, + "loss": 0.2172, "step": 1275 }, { - "epoch": 16.0, - "grad_norm": 0.00031726626912131906, - "learning_rate": 1.76139373407876e-05, - "loss": 0.0, - "step": 1280 - }, - { - "epoch": 16.0, - "eval_loss": 2.9834632873535156, - "eval_macro_f1": 74.30509548505665, - "eval_macro_precision": 74.3270405312326, - "eval_macro_recall": 74.28571428571429, - "eval_micro_f1": 74.28571428571429, - "eval_micro_precision": 74.28571428571429, - "eval_micro_recall": 74.28571428571429, - "eval_runtime": 5.3043, - "eval_samples_per_second": 197.954, - "eval_steps_per_second": 12.443, + "epoch": 3.4125083166999337, + "grad_norm": 3.3792059421539307, + "learning_rate": 0.00018728188485566544, + "loss": 0.3605, "step": 1280 }, { - "epoch": 16.062695924764892, - "grad_norm": 0.0002320996136404574, - "learning_rate": 1.7048846635695602e-05, - "loss": 0.0, + "epoch": 3.42581503659348, + "grad_norm": 2.7264702320098877, + "learning_rate": 0.00018717844136967624, + "loss": 0.3634, "step": 1285 }, { - "epoch": 16.12539184952978, - "grad_norm": 0.0005247556255199015, - "learning_rate": 1.649212322471695e-05, - "loss": 0.0, + "epoch": 3.439121756487026, + "grad_norm": 28.8140811920166, + "learning_rate": 0.00018707460771197774, + "loss": 0.2858, "step": 1290 }, { - "epoch": 16.188087774294672, - "grad_norm": 6.593632861040533e-05, - "learning_rate": 1.5943823264617796e-05, - "loss": 0.0, + "epoch": 3.452428476380572, + "grad_norm": 11.220648765563965, + "learning_rate": 0.0001869703843472829, + "loss": 0.2592, "step": 1295 }, { - "epoch": 16.25078369905956, - "grad_norm": 2.0959419998689555e-05, - "learning_rate": 1.540400206248963e-05, - "loss": 0.0, + "epoch": 3.4657351962741183, + "grad_norm": 6.689227104187012, + "learning_rate": 0.00018686577174204885, + "loss": 0.5982, "step": 1300 }, { - "epoch": 16.313479623824453, - "grad_norm": 0.0002277921448694542, - "learning_rate": 1.4872714070170468e-05, - "loss": 0.0, + "epoch": 3.479041916167665, + "grad_norm": 9.754223823547363, + "learning_rate": 0.00018676077036447494, + "loss": 0.1692, "step": 1305 }, { - "epoch": 16.37617554858934, - "grad_norm": 5.7650511735118926e-05, - "learning_rate": 1.435001287875234e-05, - "loss": 0.0, + "epoch": 3.492348636061211, + "grad_norm": 8.857412338256836, + "learning_rate": 0.00018665538068450023, + "loss": 0.6272, "step": 1310 }, { - "epoch": 16.438871473354233, - "grad_norm": 0.0003665912081487477, - "learning_rate": 1.3835951213175413e-05, - "loss": 0.0, + "epoch": 3.505655355954757, + "grad_norm": 11.387353897094727, + "learning_rate": 0.00018654960317380189, + "loss": 0.3136, "step": 1315 }, { - "epoch": 16.50156739811912, - "grad_norm": 0.00018977168656419963, - "learning_rate": 1.3330580926909763e-05, - "loss": 0.0, + "epoch": 3.5189620758483033, + "grad_norm": 12.20112419128418, + "learning_rate": 0.0001864434383057927, + "loss": 0.2563, "step": 1320 }, { - "epoch": 16.564263322884013, - "grad_norm": 2.3261072783498093e-05, - "learning_rate": 1.2833952996724863e-05, - "loss": 0.0, + "epoch": 3.5322687957418495, + "grad_norm": 5.7031121253967285, + "learning_rate": 0.0001863368865556191, + "loss": 0.2048, "step": 1325 }, { - "epoch": 16.6269592476489, - "grad_norm": 0.00015886298206169158, - "learning_rate": 1.2346117517547551e-05, - "loss": 0.0, + "epoch": 3.545575515635396, + "grad_norm": 36.38295364379883, + "learning_rate": 0.0001862299484001591, + "loss": 0.3182, "step": 1330 }, { - "epoch": 16.689655172413794, - "grad_norm": 0.00010217857925454155, - "learning_rate": 1.1867123697408854e-05, - "loss": 0.0, + "epoch": 3.5588822355289422, + "grad_norm": 0.3014439642429352, + "learning_rate": 0.00018612262431802007, + "loss": 0.1106, "step": 1335 }, { - "epoch": 16.752351097178682, - "grad_norm": 0.000697963812854141, - "learning_rate": 1.139701985248055e-05, - "loss": 0.0, + "epoch": 3.5721889554224884, + "grad_norm": 10.478448867797852, + "learning_rate": 0.00018601491478953657, + "loss": 0.3801, "step": 1340 }, { - "epoch": 16.815047021943574, - "grad_norm": 0.0002654240815900266, - "learning_rate": 1.0935853402201335e-05, - "loss": 0.0, + "epoch": 3.5854956753160345, + "grad_norm": 20.970182418823242, + "learning_rate": 0.00018590682029676823, + "loss": 0.2131, "step": 1345 }, { - "epoch": 16.877742946708462, - "grad_norm": 0.0005684046191163361, - "learning_rate": 1.0483670864493778e-05, - "loss": 0.0, + "epoch": 3.5988023952095807, + "grad_norm": 16.805198669433594, + "learning_rate": 0.00018579834132349772, + "loss": 0.4117, "step": 1350 }, { - "epoch": 16.940438871473354, - "grad_norm": 0.00042359635699540377, - "learning_rate": 1.004051785107184e-05, - "loss": 0.0, + "epoch": 3.6121091151031273, + "grad_norm": 6.421359539031982, + "learning_rate": 0.00018568947835522837, + "loss": 0.269, "step": 1355 }, { - "epoch": 17.0, - "grad_norm": 8.337834879057482e-05, - "learning_rate": 9.606439062840256e-06, - "loss": 0.0, - "step": 1360 - }, - { - "epoch": 17.0, - "eval_loss": 2.983590602874756, - "eval_macro_f1": 74.48693497595121, - "eval_macro_precision": 74.49848858038513, - "eval_macro_recall": 74.47619047619048, - "eval_micro_f1": 74.47619047619047, - "eval_micro_precision": 74.47619047619047, - "eval_micro_recall": 74.47619047619047, - "eval_runtime": 5.1696, - "eval_samples_per_second": 203.111, - "eval_steps_per_second": 12.767, + "epoch": 3.6254158349966734, + "grad_norm": 0.5675371885299683, + "learning_rate": 0.0001855802318791821, + "loss": 0.2104, "step": 1360 }, { - "epoch": 17.062695924764892, - "grad_norm": 0.0003075774875469506, - "learning_rate": 9.181478285385381e-06, - "loss": 0.0, + "epoch": 3.6387225548902196, + "grad_norm": 5.67156982421875, + "learning_rate": 0.00018547060238429736, + "loss": 0.248, "step": 1365 }, { - "epoch": 17.12539184952978, - "grad_norm": 8.384697139263153e-05, - "learning_rate": 8.765678384558607e-06, - "loss": 0.0, + "epoch": 3.6520292747837657, + "grad_norm": 5.464066982269287, + "learning_rate": 0.00018536059036122667, + "loss": 0.2855, "step": 1370 }, { - "epoch": 17.188087774294672, - "grad_norm": 0.00010231971828034148, - "learning_rate": 8.359081302152394e-06, - "loss": 0.0, + "epoch": 3.665335994677312, + "grad_norm": 2.300774097442627, + "learning_rate": 0.00018525019630233463, + "loss": 0.1769, "step": 1375 }, { - "epoch": 17.25078369905956, - "grad_norm": 0.0003627596015576273, - "learning_rate": 7.961728051669737e-06, - "loss": 0.0, + "epoch": 3.6786427145708585, + "grad_norm": 4.122674942016602, + "learning_rate": 0.0001851394207016957, + "loss": 0.3247, "step": 1380 }, { - "epoch": 17.313479623824453, - "grad_norm": 3.662167000584304e-05, - "learning_rate": 7.5736587141870155e-06, - "loss": 0.0, + "epoch": 3.6919494344644046, + "grad_norm": 4.85117244720459, + "learning_rate": 0.0001850282640550919, + "loss": 0.3428, "step": 1385 }, { - "epoch": 17.37617554858934, - "grad_norm": 0.0005837052594870329, - "learning_rate": 7.194912434311052e-06, - "loss": 0.0, + "epoch": 3.7052561543579507, + "grad_norm": 5.2849602699279785, + "learning_rate": 0.00018491672686001066, + "loss": 0.1914, "step": 1390 }, { - "epoch": 17.438871473354233, - "grad_norm": 7.577840005978942e-05, - "learning_rate": 6.8255274162305374e-06, - "loss": 0.0, + "epoch": 3.718562874251497, + "grad_norm": 4.865028381347656, + "learning_rate": 0.0001848048096156426, + "loss": 0.2503, "step": 1395 }, { - "epoch": 17.50156739811912, - "grad_norm": 0.00022036675363779068, - "learning_rate": 6.465540919862456e-06, - "loss": 0.0, + "epoch": 3.731869594145043, + "grad_norm": 5.3449482917785645, + "learning_rate": 0.0001846925128228792, + "loss": 0.2556, "step": 1400 }, { - "epoch": 17.564263322884013, - "grad_norm": 7.557481876574457e-05, - "learning_rate": 6.11498925709364e-06, - "loss": 0.0, + "epoch": 3.7451763140385896, + "grad_norm": 5.978095531463623, + "learning_rate": 0.00018457983698431075, + "loss": 0.0786, "step": 1405 }, { - "epoch": 17.6269592476489, - "grad_norm": 0.00023109870380721986, - "learning_rate": 5.77390778811796e-06, - "loss": 0.0, + "epoch": 3.758483033932136, + "grad_norm": 19.654855728149414, + "learning_rate": 0.00018446678260422385, + "loss": 0.4378, "step": 1410 }, { - "epoch": 17.689655172413794, - "grad_norm": 0.0002563658345025033, - "learning_rate": 5.44233091786951e-06, - "loss": 0.0, + "epoch": 3.771789753825682, + "grad_norm": 14.306594848632812, + "learning_rate": 0.0001843533501885993, + "loss": 0.4566, "step": 1415 }, { - "epoch": 17.752351097178682, - "grad_norm": 1.977133797481656e-05, - "learning_rate": 5.12029209255227e-06, - "loss": 0.0, + "epoch": 3.785096473719228, + "grad_norm": 17.783693313598633, + "learning_rate": 0.00018423954024510996, + "loss": 0.1464, "step": 1420 }, { - "epoch": 17.815047021943574, - "grad_norm": 0.00025862394249998033, - "learning_rate": 4.807823796266331e-06, - "loss": 0.0, + "epoch": 3.7984031936127742, + "grad_norm": 3.0436503887176514, + "learning_rate": 0.00018412535328311814, + "loss": 0.2691, "step": 1425 }, { - "epoch": 17.877742946708462, - "grad_norm": 0.00048169083311222494, - "learning_rate": 4.504957547731214e-06, - "loss": 0.0, + "epoch": 3.811709913506321, + "grad_norm": 18.622018814086914, + "learning_rate": 0.00018401078981367363, + "loss": 0.4671, "step": 1430 }, { - "epoch": 17.940438871473354, - "grad_norm": 1.255560255231103e-05, - "learning_rate": 4.211723897106534e-06, - "loss": 0.0, + "epoch": 3.825016633399867, + "grad_norm": 7.90670108795166, + "learning_rate": 0.0001838958503495113, + "loss": 0.3429, "step": 1435 }, { - "epoch": 18.0, - "grad_norm": 0.0004092851304449141, - "learning_rate": 3.928152422910491e-06, - "loss": 0.0, - "step": 1440 - }, - { - "epoch": 18.0, - "eval_loss": 2.982109308242798, - "eval_macro_f1": 74.49440308940996, - "eval_macro_precision": 74.5153120821081, - "eval_macro_recall": 74.47619047619047, - "eval_micro_f1": 74.47619047619047, - "eval_micro_precision": 74.47619047619047, - "eval_micro_recall": 74.47619047619047, - "eval_runtime": 5.0341, - "eval_samples_per_second": 208.578, - "eval_steps_per_second": 13.111, + "epoch": 3.838323353293413, + "grad_norm": 4.488195419311523, + "learning_rate": 0.00018378053540504873, + "loss": 0.3645, "step": 1440 }, { - "epoch": 18.062695924764892, - "grad_norm": 9.531569230603054e-05, - "learning_rate": 3.6542717290362515e-06, - "loss": 0.0, + "epoch": 3.8516300731869593, + "grad_norm": 8.05339241027832, + "learning_rate": 0.0001836648454963841, + "loss": 0.2885, "step": 1445 }, { - "epoch": 18.12539184952978, - "grad_norm": 5.50920121895615e-05, - "learning_rate": 3.390109441866618e-06, - "loss": 0.0, + "epoch": 3.8649367930805054, + "grad_norm": 3.8606925010681152, + "learning_rate": 0.00018354878114129367, + "loss": 0.2617, "step": 1450 }, { - "epoch": 18.188087774294672, - "grad_norm": 0.00010016823216574267, - "learning_rate": 3.135692207487373e-06, - "loss": 0.0, + "epoch": 3.878243512974052, + "grad_norm": 7.334080696105957, + "learning_rate": 0.00018343234285922953, + "loss": 0.2205, "step": 1455 }, { - "epoch": 18.25078369905956, - "grad_norm": 0.0006045085028745234, - "learning_rate": 2.8910456889995498e-06, - "loss": 0.0, + "epoch": 3.891550232867598, + "grad_norm": 10.274678230285645, + "learning_rate": 0.0001833155311713174, + "loss": 0.2675, "step": 1460 }, { - "epoch": 18.313479623824453, - "grad_norm": 0.00010283043229719624, - "learning_rate": 2.656194563930714e-06, - "loss": 0.0, + "epoch": 3.9048569527611443, + "grad_norm": 4.16168212890625, + "learning_rate": 0.00018319834660035413, + "loss": 0.1654, "step": 1465 }, { - "epoch": 18.37617554858934, - "grad_norm": 0.00047241951688192785, - "learning_rate": 2.4311625217457778e-06, - "loss": 0.0, + "epoch": 3.9181636726546905, + "grad_norm": 38.201255798339844, + "learning_rate": 0.00018308078967080546, + "loss": 0.2716, "step": 1470 }, { - "epoch": 18.438871473354233, - "grad_norm": 0.0003394366940483451, - "learning_rate": 2.2159722614573996e-06, - "loss": 0.0, + "epoch": 3.9314703925482366, + "grad_norm": 19.68052864074707, + "learning_rate": 0.0001829628609088036, + "loss": 0.4382, "step": 1475 }, { - "epoch": 18.50156739811912, - "grad_norm": 0.000773964449763298, - "learning_rate": 2.010645489336382e-06, - "loss": 0.0, + "epoch": 3.944777112441783, + "grad_norm": 4.916224956512451, + "learning_rate": 0.00018284456084214496, + "loss": 0.4168, "step": 1480 }, { - "epoch": 18.564263322884013, - "grad_norm": 6.701203528791666e-05, - "learning_rate": 1.8152029167221475e-06, - "loss": 0.0, + "epoch": 3.9580838323353293, + "grad_norm": 8.401510238647461, + "learning_rate": 0.00018272589000028772, + "loss": 0.4438, "step": 1485 }, { - "epoch": 18.6269592476489, - "grad_norm": 0.0003426434122957289, - "learning_rate": 1.6296642579335496e-06, - "loss": 0.0, + "epoch": 3.9713905522288755, + "grad_norm": 5.434607982635498, + "learning_rate": 0.0001826068489143495, + "loss": 0.3375, "step": 1490 }, { - "epoch": 18.689655172413794, - "grad_norm": 0.00010157287761103362, - "learning_rate": 1.4540482282803137e-06, - "loss": 0.0, + "epoch": 3.984697272122422, + "grad_norm": 2.88216495513916, + "learning_rate": 0.00018248743811710488, + "loss": 0.3334, "step": 1495 }, { - "epoch": 18.752351097178682, - "grad_norm": 0.0003385106392670423, - "learning_rate": 1.2883725421752201e-06, - "loss": 0.0, + "epoch": 3.998003992015968, + "grad_norm": 8.627459526062012, + "learning_rate": 0.0001823676581429833, + "loss": 0.3474, "step": 1500 }, { - "epoch": 18.815047021943574, - "grad_norm": 0.00031495324219577014, - "learning_rate": 1.132653911347248e-06, - "loss": 0.0, + "epoch": 3.998003992015968, + "eval_loss": 0.6241031289100647, + "eval_macro_f1": 65.75275685358942, + "eval_macro_precision": 66.95092371496796, + "eval_macro_recall": 66.23136102415846, + "eval_micro_f1": 85.55389221556887, + "eval_micro_precision": 85.55389221556887, + "eval_micro_recall": 85.55389221556887, + "eval_runtime": 4.7326, + "eval_samples_per_second": 564.595, + "eval_steps_per_second": 35.287, + "step": 1500 + }, + { + "epoch": 4.013306719893547, + "grad_norm": 1.1038553714752197, + "learning_rate": 0.00018224750952806624, + "loss": 0.0693, "step": 1505 }, { - "epoch": 18.877742946708462, - "grad_norm": 2.4275641408166848e-05, - "learning_rate": 9.869080431558542e-07, - "loss": 0.0, + "epoch": 4.026613439787092, + "grad_norm": 0.036140475422143936, + "learning_rate": 0.0001821269928100852, + "loss": 0.0629, "step": 1510 }, { - "epoch": 18.940438871473354, - "grad_norm": 0.0007264981977641582, - "learning_rate": 8.511496390065543e-07, - "loss": 0.0, + "epoch": 4.039920159680639, + "grad_norm": 16.097137451171875, + "learning_rate": 0.00018200610852841913, + "loss": 0.1647, "step": 1515 }, { - "epoch": 19.0, - "grad_norm": 3.2049887522589415e-05, - "learning_rate": 7.253923928680406e-07, - "loss": 0.0, - "step": 1520 - }, - { - "epoch": 19.0, - "eval_loss": 2.983579158782959, - "eval_macro_f1": 74.59288643616037, - "eval_macro_precision": 74.61805414620287, - "eval_macro_recall": 74.57142857142857, - "eval_micro_f1": 74.57142857142857, - "eval_micro_precision": 74.57142857142857, - "eval_micro_recall": 74.57142857142857, - "eval_runtime": 3.5338, - "eval_samples_per_second": 297.133, - "eval_steps_per_second": 18.677, + "epoch": 4.053226879574185, + "grad_norm": 97.72679138183594, + "learning_rate": 0.00018188485722409197, + "loss": 0.3568, "step": 1520 }, { - "epoch": 19.062695924764892, - "grad_norm": 0.00015775053179822862, - "learning_rate": 6.096489898908208e-07, - "loss": 0.0, + "epoch": 4.066533599467731, + "grad_norm": 26.921789169311523, + "learning_rate": 0.00018176323943977033, + "loss": 0.314, "step": 1525 }, { - "epoch": 19.12539184952978, - "grad_norm": 0.00020885077537968755, - "learning_rate": 5.039311051276752e-07, - "loss": 0.0, + "epoch": 4.079840319361278, + "grad_norm": 29.175439834594727, + "learning_rate": 0.00018164125571976098, + "loss": 0.151, "step": 1530 }, { - "epoch": 19.188087774294672, - "grad_norm": 0.0004631892079487443, - "learning_rate": 4.082494023560091e-07, - "loss": 0.0, + "epoch": 4.0931470392548235, + "grad_norm": 36.96811294555664, + "learning_rate": 0.00018151890661000856, + "loss": 0.1288, "step": 1535 }, { - "epoch": 19.25078369905956, - "grad_norm": 7.329711661441252e-05, - "learning_rate": 3.2261353300219176e-07, - "loss": 0.0, + "epoch": 4.10645375914837, + "grad_norm": 17.12455177307129, + "learning_rate": 0.0001813961926580929, + "loss": 0.1292, "step": 1540 }, { - "epoch": 19.313479623824453, - "grad_norm": 0.000103279686300084, - "learning_rate": 2.4703213516799053e-07, - "loss": 0.0, + "epoch": 4.119760479041916, + "grad_norm": 6.079716205596924, + "learning_rate": 0.0001812731144132268, + "loss": 0.2966, "step": 1545 }, { - "epoch": 19.37617554858934, - "grad_norm": 0.00035780860343948007, - "learning_rate": 1.8151283275928964e-07, - "loss": 0.0, + "epoch": 4.133067198935462, + "grad_norm": 6.217795372009277, + "learning_rate": 0.00018114967242625343, + "loss": 0.0419, "step": 1550 }, { - "epoch": 19.438871473354233, - "grad_norm": 0.0009104281198233366, - "learning_rate": 1.2606223471702817e-07, - "loss": 0.0, + "epoch": 4.146373918829009, + "grad_norm": 4.027215957641602, + "learning_rate": 0.00018102586724964387, + "loss": 0.1433, "step": 1555 }, { - "epoch": 19.50156739811912, - "grad_norm": 0.0008115767268463969, - "learning_rate": 8.068593435055505e-08, - "loss": 0.0, + "epoch": 4.159680638722555, + "grad_norm": 0.21189923584461212, + "learning_rate": 0.00018090169943749476, + "loss": 0.2901, "step": 1560 }, { - "epoch": 19.564263322884013, - "grad_norm": 2.108391527144704e-05, - "learning_rate": 4.5388508773469564e-08, - "loss": 0.0, + "epoch": 4.172987358616101, + "grad_norm": 0.8172288537025452, + "learning_rate": 0.00018077716954552564, + "loss": 0.2699, "step": 1565 }, { - "epoch": 19.6269592476489, - "grad_norm": 0.00026582309510558844, - "learning_rate": 2.0173518441868324e-08, - "loss": 0.0, + "epoch": 4.186294078509647, + "grad_norm": 1.5100902318954468, + "learning_rate": 0.00018065227813107666, + "loss": 0.1433, "step": 1570 }, { - "epoch": 19.689655172413794, - "grad_norm": 0.0001829984103096649, - "learning_rate": 5.043506795276987e-09, - "loss": 0.0, + "epoch": 4.199600798403194, + "grad_norm": 19.998355865478516, + "learning_rate": 0.00018052702575310588, + "loss": 0.2386, "step": 1575 }, { - "epoch": 19.752351097178682, - "grad_norm": 0.00031997705809772015, - "learning_rate": 0.0, - "loss": 0.0, + "epoch": 4.21290751829674, + "grad_norm": 2.6985023021698, + "learning_rate": 0.00018040141297218695, + "loss": 0.0853, "step": 1580 }, { - "epoch": 19.752351097178682, - "eval_loss": 2.983431816101074, - "eval_macro_f1": 74.495166799055, - "eval_macro_precision": 74.51671347356447, - "eval_macro_recall": 74.47619047619047, - "eval_micro_f1": 74.47619047619047, - "eval_micro_precision": 74.47619047619047, - "eval_micro_recall": 74.47619047619047, - "eval_runtime": 5.0918, - "eval_samples_per_second": 206.215, - "eval_steps_per_second": 12.962, - "step": 1580 + "epoch": 4.226214238190286, + "grad_norm": 3.395392894744873, + "learning_rate": 0.00018027544035050644, + "loss": 0.1964, + "step": 1585 + }, + { + "epoch": 4.2395209580838324, + "grad_norm": 7.006717681884766, + "learning_rate": 0.00018014910845186153, + "loss": 0.2763, + "step": 1590 + }, + { + "epoch": 4.252827677977379, + "grad_norm": 1.993622899055481, + "learning_rate": 0.00018002241784165728, + "loss": 0.2098, + "step": 1595 + }, + { + "epoch": 4.266134397870925, + "grad_norm": 0.10528213530778885, + "learning_rate": 0.00017989536908690412, + "loss": 0.2781, + "step": 1600 + }, + { + "epoch": 4.279441117764471, + "grad_norm": 1.242319107055664, + "learning_rate": 0.00017976796275621555, + "loss": 0.029, + "step": 1605 + }, + { + "epoch": 4.292747837658017, + "grad_norm": 6.547476291656494, + "learning_rate": 0.00017964019941980522, + "loss": 0.4249, + "step": 1610 + }, + { + "epoch": 4.306054557551564, + "grad_norm": 6.174795150756836, + "learning_rate": 0.0001795120796494848, + "loss": 0.2151, + "step": 1615 + }, + { + "epoch": 4.319361277445109, + "grad_norm": 34.997474670410156, + "learning_rate": 0.00017938360401866093, + "loss": 0.0941, + "step": 1620 + }, + { + "epoch": 4.332667997338656, + "grad_norm": 4.191906452178955, + "learning_rate": 0.00017925477310233316, + "loss": 0.0643, + "step": 1625 + }, + { + "epoch": 4.3459747172322025, + "grad_norm": 0.43293797969818115, + "learning_rate": 0.00017912558747709104, + "loss": 0.1264, + "step": 1630 + }, + { + "epoch": 4.359281437125748, + "grad_norm": 24.775293350219727, + "learning_rate": 0.00017899604772111163, + "loss": 0.3707, + "step": 1635 + }, + { + "epoch": 4.372588157019295, + "grad_norm": 1.0308302640914917, + "learning_rate": 0.0001788661544141569, + "loss": 0.4379, + "step": 1640 + }, + { + "epoch": 4.385894876912841, + "grad_norm": 7.142487525939941, + "learning_rate": 0.0001787359081375713, + "loss": 0.3458, + "step": 1645 + }, + { + "epoch": 4.399201596806387, + "grad_norm": 18.565332412719727, + "learning_rate": 0.00017860530947427875, + "loss": 0.241, + "step": 1650 + }, + { + "epoch": 4.412508316699934, + "grad_norm": 1.515366554260254, + "learning_rate": 0.00017847435900878058, + "loss": 0.2142, + "step": 1655 + }, + { + "epoch": 4.425815036593479, + "grad_norm": 2.90205454826355, + "learning_rate": 0.0001783430573271524, + "loss": 0.1881, + "step": 1660 + }, + { + "epoch": 4.439121756487026, + "grad_norm": 3.373195171356201, + "learning_rate": 0.00017821140501704194, + "loss": 0.2427, + "step": 1665 + }, + { + "epoch": 4.452428476380573, + "grad_norm": 4.514166831970215, + "learning_rate": 0.00017807940266766593, + "loss": 0.2774, + "step": 1670 + }, + { + "epoch": 4.465735196274118, + "grad_norm": 16.700340270996094, + "learning_rate": 0.0001779470508698079, + "loss": 0.1988, + "step": 1675 + }, + { + "epoch": 4.479041916167665, + "grad_norm": 7.546816349029541, + "learning_rate": 0.00017781435021581527, + "loss": 0.2984, + "step": 1680 + }, + { + "epoch": 4.492348636061211, + "grad_norm": 1.9645479917526245, + "learning_rate": 0.00017768130129959683, + "loss": 0.3073, + "step": 1685 + }, + { + "epoch": 4.505655355954757, + "grad_norm": 37.8007926940918, + "learning_rate": 0.00017754790471662002, + "loss": 0.3818, + "step": 1690 + }, + { + "epoch": 4.518962075848304, + "grad_norm": 8.654253959655762, + "learning_rate": 0.00017741416106390826, + "loss": 0.5542, + "step": 1695 + }, + { + "epoch": 4.5322687957418495, + "grad_norm": 7.000682830810547, + "learning_rate": 0.0001772800709400383, + "loss": 0.5024, + "step": 1700 + }, + { + "epoch": 4.545575515635396, + "grad_norm": 11.131951332092285, + "learning_rate": 0.00017714563494513764, + "loss": 0.2344, + "step": 1705 + }, + { + "epoch": 4.558882235528942, + "grad_norm": 4.567352771759033, + "learning_rate": 0.00017701085368088156, + "loss": 0.2835, + "step": 1710 + }, + { + "epoch": 4.572188955422488, + "grad_norm": 12.048651695251465, + "learning_rate": 0.0001768757277504908, + "loss": 0.1937, + "step": 1715 + }, + { + "epoch": 4.585495675316035, + "grad_norm": 21.26102638244629, + "learning_rate": 0.00017674025775872852, + "loss": 0.3544, + "step": 1720 + }, + { + "epoch": 4.598802395209581, + "grad_norm": 8.841492652893066, + "learning_rate": 0.0001766044443118978, + "loss": 0.3688, + "step": 1725 + }, + { + "epoch": 4.612109115103127, + "grad_norm": 7.04481315612793, + "learning_rate": 0.00017646828801783895, + "loss": 0.2074, + "step": 1730 + }, + { + "epoch": 4.625415834996673, + "grad_norm": 6.048544406890869, + "learning_rate": 0.0001763317894859266, + "loss": 0.1041, + "step": 1735 + }, + { + "epoch": 4.63872255489022, + "grad_norm": 15.356284141540527, + "learning_rate": 0.0001761949493270671, + "loss": 0.1728, + "step": 1740 + }, + { + "epoch": 4.652029274783766, + "grad_norm": 1.0128647089004517, + "learning_rate": 0.00017605776815369581, + "loss": 0.1081, + "step": 1745 + }, + { + "epoch": 4.665335994677312, + "grad_norm": 23.05744743347168, + "learning_rate": 0.00017592024657977432, + "loss": 0.2708, + "step": 1750 + }, + { + "epoch": 4.6786427145708585, + "grad_norm": 17.319061279296875, + "learning_rate": 0.0001757823852207877, + "loss": 0.1197, + "step": 1755 + }, + { + "epoch": 4.691949434464404, + "grad_norm": 6.973044395446777, + "learning_rate": 0.00017564418469374167, + "loss": 0.4926, + "step": 1760 + }, + { + "epoch": 4.705256154357951, + "grad_norm": 44.00906753540039, + "learning_rate": 0.00017550564561716, + "loss": 0.1962, + "step": 1765 + }, + { + "epoch": 4.718562874251497, + "grad_norm": 3.3621838092803955, + "learning_rate": 0.00017536676861108164, + "loss": 0.181, + "step": 1770 + }, + { + "epoch": 4.731869594145043, + "grad_norm": 8.182963371276855, + "learning_rate": 0.00017522755429705798, + "loss": 0.0915, + "step": 1775 + }, + { + "epoch": 4.74517631403859, + "grad_norm": 27.33730697631836, + "learning_rate": 0.00017508800329814995, + "loss": 0.3047, + "step": 1780 + }, + { + "epoch": 4.758483033932135, + "grad_norm": 20.636539459228516, + "learning_rate": 0.0001749481162389254, + "loss": 0.3657, + "step": 1785 + }, + { + "epoch": 4.771789753825682, + "grad_norm": 26.856287002563477, + "learning_rate": 0.00017480789374545633, + "loss": 0.4397, + "step": 1790 + }, + { + "epoch": 4.7850964737192285, + "grad_norm": 6.763969898223877, + "learning_rate": 0.0001746673364453158, + "loss": 0.2756, + "step": 1795 + }, + { + "epoch": 4.798403193612774, + "grad_norm": 2.868919849395752, + "learning_rate": 0.0001745264449675755, + "loss": 0.1183, + "step": 1800 + }, + { + "epoch": 4.811709913506321, + "grad_norm": 13.854222297668457, + "learning_rate": 0.00017438521994280257, + "loss": 0.2545, + "step": 1805 + }, + { + "epoch": 4.825016633399867, + "grad_norm": 14.696944236755371, + "learning_rate": 0.00017424366200305714, + "loss": 0.3584, + "step": 1810 + }, + { + "epoch": 4.838323353293413, + "grad_norm": 18.185508728027344, + "learning_rate": 0.00017410177178188918, + "loss": 0.4049, + "step": 1815 + }, + { + "epoch": 4.85163007318696, + "grad_norm": 3.704495429992676, + "learning_rate": 0.00017395954991433586, + "loss": 0.3104, + "step": 1820 + }, + { + "epoch": 4.864936793080505, + "grad_norm": 16.54859161376953, + "learning_rate": 0.00017381699703691866, + "loss": 0.3405, + "step": 1825 + }, + { + "epoch": 4.878243512974052, + "grad_norm": 8.207486152648926, + "learning_rate": 0.0001736741137876405, + "loss": 0.2789, + "step": 1830 + }, + { + "epoch": 4.891550232867598, + "grad_norm": 2.0907087326049805, + "learning_rate": 0.0001735309008059829, + "loss": 0.1476, + "step": 1835 + }, + { + "epoch": 4.904856952761144, + "grad_norm": 7.623042106628418, + "learning_rate": 0.00017338735873290303, + "loss": 0.3736, + "step": 1840 + }, + { + "epoch": 4.918163672654691, + "grad_norm": 6.89530086517334, + "learning_rate": 0.0001732434882108311, + "loss": 0.1917, + "step": 1845 + }, + { + "epoch": 4.931470392548237, + "grad_norm": 46.501251220703125, + "learning_rate": 0.00017309928988366716, + "loss": 0.1845, + "step": 1850 + }, + { + "epoch": 4.944777112441783, + "grad_norm": 14.652153015136719, + "learning_rate": 0.00017295476439677847, + "loss": 0.2152, + "step": 1855 + }, + { + "epoch": 4.95808383233533, + "grad_norm": 16.384225845336914, + "learning_rate": 0.00017280991239699642, + "loss": 0.3643, + "step": 1860 + }, + { + "epoch": 4.9713905522288755, + "grad_norm": 1.2959167957305908, + "learning_rate": 0.00017266473453261377, + "loss": 0.341, + "step": 1865 + }, + { + "epoch": 4.984697272122422, + "grad_norm": 16.964174270629883, + "learning_rate": 0.00017251923145338176, + "loss": 0.393, + "step": 1870 + }, + { + "epoch": 4.998003992015968, + "grad_norm": 3.103013038635254, + "learning_rate": 0.00017237340381050703, + "loss": 0.2354, + "step": 1875 + }, + { + "epoch": 4.998003992015968, + "eval_loss": 0.6098228693008423, + "eval_macro_f1": 64.20293730741194, + "eval_macro_precision": 67.9233057601082, + "eval_macro_recall": 61.45233240477489, + "eval_micro_f1": 86.78892215568862, + "eval_micro_precision": 86.78892215568862, + "eval_micro_recall": 86.78892215568862, + "eval_runtime": 4.8204, + "eval_samples_per_second": 554.308, + "eval_steps_per_second": 34.644, + "step": 1875 + }, + { + "epoch": 5.013306719893547, + "grad_norm": 10.461470603942871, + "learning_rate": 0.0001722272522566489, + "loss": 0.1565, + "step": 1880 + }, + { + "epoch": 5.026613439787092, + "grad_norm": 3.9339942932128906, + "learning_rate": 0.00017208077744591634, + "loss": 0.1466, + "step": 1885 + }, + { + "epoch": 5.039920159680639, + "grad_norm": 7.42096471786499, + "learning_rate": 0.0001719339800338651, + "loss": 0.0382, + "step": 1890 + }, + { + "epoch": 5.053226879574185, + "grad_norm": 0.58461594581604, + "learning_rate": 0.0001717868606774948, + "loss": 0.1592, + "step": 1895 + }, + { + "epoch": 5.066533599467731, + "grad_norm": 1.377388596534729, + "learning_rate": 0.00017163942003524572, + "loss": 0.0645, + "step": 1900 + }, + { + "epoch": 5.079840319361278, + "grad_norm": 0.42152106761932373, + "learning_rate": 0.00017149165876699635, + "loss": 0.0022, + "step": 1905 + }, + { + "epoch": 5.0931470392548235, + "grad_norm": 0.027353579178452492, + "learning_rate": 0.00017134357753406003, + "loss": 0.2095, + "step": 1910 + }, + { + "epoch": 5.10645375914837, + "grad_norm": 44.416969299316406, + "learning_rate": 0.00017119517699918207, + "loss": 0.0876, + "step": 1915 + }, + { + "epoch": 5.119760479041916, + "grad_norm": 19.71007537841797, + "learning_rate": 0.0001710464578265369, + "loss": 0.0498, + "step": 1920 + }, + { + "epoch": 5.133067198935462, + "grad_norm": 8.512078285217285, + "learning_rate": 0.000170897420681725, + "loss": 0.2547, + "step": 1925 + }, + { + "epoch": 5.146373918829009, + "grad_norm": 0.14262406527996063, + "learning_rate": 0.00017074806623177, + "loss": 0.5994, + "step": 1930 + }, + { + "epoch": 5.159680638722555, + "grad_norm": 3.3490352630615234, + "learning_rate": 0.00017059839514511565, + "loss": 0.2067, + "step": 1935 + }, + { + "epoch": 5.172987358616101, + "grad_norm": 13.887410163879395, + "learning_rate": 0.00017044840809162271, + "loss": 0.3215, + "step": 1940 + }, + { + "epoch": 5.186294078509647, + "grad_norm": 1.4734302759170532, + "learning_rate": 0.0001702981057425662, + "loss": 0.1817, + "step": 1945 + }, + { + "epoch": 5.199600798403194, + "grad_norm": 30.033546447753906, + "learning_rate": 0.00017014748877063214, + "loss": 0.1956, + "step": 1950 + }, + { + "epoch": 5.21290751829674, + "grad_norm": 15.2096529006958, + "learning_rate": 0.00016999655784991478, + "loss": 0.2823, + "step": 1955 + }, + { + "epoch": 5.226214238190286, + "grad_norm": 40.786888122558594, + "learning_rate": 0.0001698453136559134, + "loss": 0.3874, + "step": 1960 + }, + { + "epoch": 5.2395209580838324, + "grad_norm": 21.138729095458984, + "learning_rate": 0.00016969375686552937, + "loss": 0.1977, + "step": 1965 + }, + { + "epoch": 5.252827677977379, + "grad_norm": 39.89869689941406, + "learning_rate": 0.00016954188815706305, + "loss": 0.2877, + "step": 1970 + }, + { + "epoch": 5.266134397870925, + "grad_norm": 4.198141098022461, + "learning_rate": 0.0001693897082102109, + "loss": 0.1792, + "step": 1975 + }, + { + "epoch": 5.279441117764471, + "grad_norm": 25.046525955200195, + "learning_rate": 0.00016923721770606228, + "loss": 0.1408, + "step": 1980 + }, + { + "epoch": 5.292747837658017, + "grad_norm": 31.608402252197266, + "learning_rate": 0.0001690844173270965, + "loss": 0.2465, + "step": 1985 + }, + { + "epoch": 5.306054557551564, + "grad_norm": 0.058475542813539505, + "learning_rate": 0.00016893130775717962, + "loss": 0.1367, + "step": 1990 + }, + { + "epoch": 5.319361277445109, + "grad_norm": 23.879507064819336, + "learning_rate": 0.0001687778896815617, + "loss": 0.1272, + "step": 1995 + }, + { + "epoch": 5.332667997338656, + "grad_norm": 11.176948547363281, + "learning_rate": 0.0001686241637868734, + "loss": 0.2328, + "step": 2000 + }, + { + "epoch": 5.3459747172322025, + "grad_norm": 62.34965515136719, + "learning_rate": 0.000168470130761123, + "loss": 0.1621, + "step": 2005 + }, + { + "epoch": 5.359281437125748, + "grad_norm": 18.16000747680664, + "learning_rate": 0.00016831579129369346, + "loss": 0.2798, + "step": 2010 + }, + { + "epoch": 5.372588157019295, + "grad_norm": 30.744770050048828, + "learning_rate": 0.00016816114607533922, + "loss": 0.2956, + "step": 2015 + }, + { + "epoch": 5.385894876912841, + "grad_norm": 2.792100429534912, + "learning_rate": 0.00016800619579818312, + "loss": 0.1253, + "step": 2020 + }, + { + "epoch": 5.399201596806387, + "grad_norm": 0.24668025970458984, + "learning_rate": 0.00016785094115571322, + "loss": 0.1933, + "step": 2025 + }, + { + "epoch": 5.412508316699934, + "grad_norm": 9.7402925491333, + "learning_rate": 0.00016769538284277995, + "loss": 0.0581, + "step": 2030 + }, + { + "epoch": 5.425815036593479, + "grad_norm": 1.663047432899475, + "learning_rate": 0.00016753952155559266, + "loss": 0.2041, + "step": 2035 + }, + { + "epoch": 5.439121756487026, + "grad_norm": 3.0766444206237793, + "learning_rate": 0.00016738335799171682, + "loss": 0.4174, + "step": 2040 + }, + { + "epoch": 5.452428476380573, + "grad_norm": 50.167137145996094, + "learning_rate": 0.0001672268928500706, + "loss": 0.2974, + "step": 2045 + }, + { + "epoch": 5.465735196274118, + "grad_norm": 1.0232924222946167, + "learning_rate": 0.00016707012683092208, + "loss": 0.2376, + "step": 2050 + }, + { + "epoch": 5.479041916167665, + "grad_norm": 5.915070056915283, + "learning_rate": 0.00016691306063588583, + "loss": 0.0967, + "step": 2055 + }, + { + "epoch": 5.492348636061211, + "grad_norm": 3.8537585735321045, + "learning_rate": 0.00016675569496791984, + "loss": 0.3152, + "step": 2060 + }, + { + "epoch": 5.505655355954757, + "grad_norm": 1.8343299627304077, + "learning_rate": 0.00016659803053132249, + "loss": 0.1102, + "step": 2065 + }, + { + "epoch": 5.518962075848304, + "grad_norm": 3.2312400341033936, + "learning_rate": 0.00016644006803172924, + "loss": 0.189, + "step": 2070 + }, + { + "epoch": 5.5322687957418495, + "grad_norm": 6.797444820404053, + "learning_rate": 0.00016628180817610964, + "loss": 0.3289, + "step": 2075 + }, + { + "epoch": 5.545575515635396, + "grad_norm": 5.592912673950195, + "learning_rate": 0.00016612325167276394, + "loss": 0.3153, + "step": 2080 + }, + { + "epoch": 5.558882235528942, + "grad_norm": 2.010084629058838, + "learning_rate": 0.00016596439923132017, + "loss": 0.1268, + "step": 2085 + }, + { + "epoch": 5.572188955422488, + "grad_norm": 30.318086624145508, + "learning_rate": 0.0001658052515627308, + "loss": 0.3523, + "step": 2090 + }, + { + "epoch": 5.585495675316035, + "grad_norm": 27.016624450683594, + "learning_rate": 0.00016564580937926962, + "loss": 0.2108, + "step": 2095 + }, + { + "epoch": 5.598802395209581, + "grad_norm": 9.180904388427734, + "learning_rate": 0.00016548607339452853, + "loss": 0.0995, + "step": 2100 + }, + { + "epoch": 5.612109115103127, + "grad_norm": 0.2903202474117279, + "learning_rate": 0.0001653260443234143, + "loss": 0.0875, + "step": 2105 + }, + { + "epoch": 5.625415834996673, + "grad_norm": 3.65264630317688, + "learning_rate": 0.00016516572288214552, + "loss": 0.4259, + "step": 2110 + }, + { + "epoch": 5.63872255489022, + "grad_norm": 26.856353759765625, + "learning_rate": 0.00016500510978824926, + "loss": 0.176, + "step": 2115 + }, + { + "epoch": 5.652029274783766, + "grad_norm": 28.680461883544922, + "learning_rate": 0.00016484420576055785, + "loss": 0.2412, + "step": 2120 + }, + { + "epoch": 5.665335994677312, + "grad_norm": 2.5007455348968506, + "learning_rate": 0.00016468301151920575, + "loss": 0.1002, + "step": 2125 + }, + { + "epoch": 5.6786427145708585, + "grad_norm": 13.849712371826172, + "learning_rate": 0.0001645215277856263, + "loss": 0.1694, + "step": 2130 + }, + { + "epoch": 5.691949434464404, + "grad_norm": 0.9836284518241882, + "learning_rate": 0.0001643597552825485, + "loss": 0.4659, + "step": 2135 + }, + { + "epoch": 5.705256154357951, + "grad_norm": 6.066076755523682, + "learning_rate": 0.00016419769473399363, + "loss": 0.2144, + "step": 2140 + }, + { + "epoch": 5.718562874251497, + "grad_norm": 0.6988075375556946, + "learning_rate": 0.00016403534686527225, + "loss": 0.3341, + "step": 2145 + }, + { + "epoch": 5.731869594145043, + "grad_norm": 1.4430186748504639, + "learning_rate": 0.0001638727124029808, + "loss": 0.117, + "step": 2150 + }, + { + "epoch": 5.74517631403859, + "grad_norm": 0.7749199867248535, + "learning_rate": 0.00016370979207499845, + "loss": 0.0893, + "step": 2155 + }, + { + "epoch": 5.758483033932135, + "grad_norm": 2.697474479675293, + "learning_rate": 0.00016354658661048364, + "loss": 0.2948, + "step": 2160 + }, + { + "epoch": 5.771789753825682, + "grad_norm": 0.0894465297460556, + "learning_rate": 0.00016338309673987101, + "loss": 0.1375, + "step": 2165 + }, + { + "epoch": 5.7850964737192285, + "grad_norm": 7.317739963531494, + "learning_rate": 0.0001632193231948682, + "loss": 0.1734, + "step": 2170 + }, + { + "epoch": 5.798403193612774, + "grad_norm": 2.099191665649414, + "learning_rate": 0.00016305526670845226, + "loss": 0.1395, + "step": 2175 }, { - "epoch": 19.752351097178682, - "step": 1580, - "total_flos": 8695327329615872.0, - "train_loss": 0.2892922113949427, - "train_runtime": 830.7771, - "train_samples_per_second": 122.777, - "train_steps_per_second": 1.902 + "epoch": 5.811709913506321, + "grad_norm": 3.64398193359375, + "learning_rate": 0.00016289092801486667, + "loss": 0.1896, + "step": 2180 + }, + { + "epoch": 5.825016633399867, + "grad_norm": 3.6657869815826416, + "learning_rate": 0.00016272630784961787, + "loss": 0.1412, + "step": 2185 + }, + { + "epoch": 5.838323353293413, + "grad_norm": 1.3434737920761108, + "learning_rate": 0.00016256140694947217, + "loss": 0.1477, + "step": 2190 + }, + { + "epoch": 5.85163007318696, + "grad_norm": 0.9189901351928711, + "learning_rate": 0.00016239622605245216, + "loss": 0.1273, + "step": 2195 + }, + { + "epoch": 5.864936793080505, + "grad_norm": 42.35541534423828, + "learning_rate": 0.00016223076589783368, + "loss": 0.1507, + "step": 2200 + }, + { + "epoch": 5.878243512974052, + "grad_norm": 5.649895668029785, + "learning_rate": 0.00016206502722614238, + "loss": 0.2484, + "step": 2205 + }, + { + "epoch": 5.891550232867598, + "grad_norm": 1.5216484069824219, + "learning_rate": 0.00016189901077915043, + "loss": 0.0698, + "step": 2210 + }, + { + "epoch": 5.904856952761144, + "grad_norm": 18.50678825378418, + "learning_rate": 0.0001617327172998732, + "loss": 0.1503, + "step": 2215 + }, + { + "epoch": 5.918163672654691, + "grad_norm": 25.463354110717773, + "learning_rate": 0.0001615661475325658, + "loss": 0.2796, + "step": 2220 + }, + { + "epoch": 5.931470392548237, + "grad_norm": 0.9343916773796082, + "learning_rate": 0.0001613993022227202, + "loss": 0.4415, + "step": 2225 + }, + { + "epoch": 5.944777112441783, + "grad_norm": 0.4629737436771393, + "learning_rate": 0.00016123218211706126, + "loss": 0.2218, + "step": 2230 + }, + { + "epoch": 5.95808383233533, + "grad_norm": 4.891594886779785, + "learning_rate": 0.00016106478796354382, + "loss": 0.1428, + "step": 2235 + }, + { + "epoch": 5.9713905522288755, + "grad_norm": 0.6886902451515198, + "learning_rate": 0.00016089712051134926, + "loss": 0.1396, + "step": 2240 + }, + { + "epoch": 5.984697272122422, + "grad_norm": 25.898942947387695, + "learning_rate": 0.0001607291805108821, + "loss": 0.1493, + "step": 2245 + }, + { + "epoch": 5.998003992015968, + "grad_norm": 9.379130363464355, + "learning_rate": 0.00016056096871376667, + "loss": 0.2857, + "step": 2250 + }, + { + "epoch": 5.998003992015968, + "eval_loss": 0.8669371604919434, + "eval_macro_f1": 67.23263039823122, + "eval_macro_precision": 68.19497057722046, + "eval_macro_recall": 67.175028478983, + "eval_micro_f1": 86.78892215568862, + "eval_micro_precision": 86.78892215568862, + "eval_micro_recall": 86.78892215568862, + "eval_runtime": 4.5626, + "eval_samples_per_second": 585.632, + "eval_steps_per_second": 36.602, + "step": 2250 + }, + { + "epoch": 6.013306719893547, + "grad_norm": 0.7099379301071167, + "learning_rate": 0.00016039248587284373, + "loss": 0.1042, + "step": 2255 + }, + { + "epoch": 6.026613439787092, + "grad_norm": 0.3128296732902527, + "learning_rate": 0.0001602237327421671, + "loss": 0.1591, + "step": 2260 + }, + { + "epoch": 6.039920159680639, + "grad_norm": 0.34373024106025696, + "learning_rate": 0.00016005471007700031, + "loss": 0.0785, + "step": 2265 + }, + { + "epoch": 6.053226879574185, + "grad_norm": 0.020252332091331482, + "learning_rate": 0.00015988541863381323, + "loss": 0.1448, + "step": 2270 + }, + { + "epoch": 6.066533599467731, + "grad_norm": 0.09661275893449783, + "learning_rate": 0.00015971585917027862, + "loss": 0.0803, + "step": 2275 + }, + { + "epoch": 6.079840319361278, + "grad_norm": 49.84988021850586, + "learning_rate": 0.0001595460324452688, + "loss": 0.1813, + "step": 2280 + }, + { + "epoch": 6.0931470392548235, + "grad_norm": 0.08736708760261536, + "learning_rate": 0.00015937593921885225, + "loss": 0.0707, + "step": 2285 + }, + { + "epoch": 6.10645375914837, + "grad_norm": 38.021400451660156, + "learning_rate": 0.00015920558025229014, + "loss": 0.2836, + "step": 2290 + }, + { + "epoch": 6.119760479041916, + "grad_norm": 0.023501181975007057, + "learning_rate": 0.000159034956308033, + "loss": 0.081, + "step": 2295 + }, + { + "epoch": 6.133067198935462, + "grad_norm": 6.965376853942871, + "learning_rate": 0.00015886406814971728, + "loss": 0.2449, + "step": 2300 + }, + { + "epoch": 6.146373918829009, + "grad_norm": 24.30607032775879, + "learning_rate": 0.000158692916542162, + "loss": 0.1398, + "step": 2305 + }, + { + "epoch": 6.159680638722555, + "grad_norm": 3.2971627712249756, + "learning_rate": 0.00015852150225136518, + "loss": 0.246, + "step": 2310 + }, + { + "epoch": 6.172987358616101, + "grad_norm": 0.4873849153518677, + "learning_rate": 0.00015834982604450045, + "loss": 0.1162, + "step": 2315 + }, + { + "epoch": 6.186294078509647, + "grad_norm": 21.559432983398438, + "learning_rate": 0.00015817788868991377, + "loss": 0.2077, + "step": 2320 + }, + { + "epoch": 6.199600798403194, + "grad_norm": 47.79915237426758, + "learning_rate": 0.00015800569095711982, + "loss": 0.2076, + "step": 2325 + }, + { + "epoch": 6.21290751829674, + "grad_norm": 11.995939254760742, + "learning_rate": 0.00015783323361679864, + "loss": 0.2112, + "step": 2330 + }, + { + "epoch": 6.226214238190286, + "grad_norm": 5.725085258483887, + "learning_rate": 0.00015766051744079218, + "loss": 0.3894, + "step": 2335 + }, + { + "epoch": 6.2395209580838324, + "grad_norm": 22.34183120727539, + "learning_rate": 0.00015748754320210072, + "loss": 0.1221, + "step": 2340 + }, + { + "epoch": 6.252827677977379, + "grad_norm": 8.4489107131958, + "learning_rate": 0.00015731431167487965, + "loss": 0.1507, + "step": 2345 + }, + { + "epoch": 6.266134397870925, + "grad_norm": 9.290181159973145, + "learning_rate": 0.00015714082363443575, + "loss": 0.2869, + "step": 2350 + }, + { + "epoch": 6.279441117764471, + "grad_norm": 3.636052131652832, + "learning_rate": 0.0001569670798572239, + "loss": 0.226, + "step": 2355 + }, + { + "epoch": 6.292747837658017, + "grad_norm": 29.546716690063477, + "learning_rate": 0.0001567930811208435, + "loss": 0.1516, + "step": 2360 + }, + { + "epoch": 6.306054557551564, + "grad_norm": 6.995582580566406, + "learning_rate": 0.00015661882820403516, + "loss": 0.0518, + "step": 2365 + }, + { + "epoch": 6.319361277445109, + "grad_norm": 4.880093574523926, + "learning_rate": 0.00015644432188667695, + "loss": 0.0536, + "step": 2370 + }, + { + "epoch": 6.332667997338656, + "grad_norm": 0.18312983214855194, + "learning_rate": 0.00015626956294978103, + "loss": 0.0382, + "step": 2375 + }, + { + "epoch": 6.3459747172322025, + "grad_norm": 0.03832171857357025, + "learning_rate": 0.00015609455217549032, + "loss": 0.156, + "step": 2380 + }, + { + "epoch": 6.359281437125748, + "grad_norm": 47.08546829223633, + "learning_rate": 0.0001559192903470747, + "loss": 0.2537, + "step": 2385 + }, + { + "epoch": 6.372588157019295, + "grad_norm": 26.35301399230957, + "learning_rate": 0.00015574377824892777, + "loss": 0.1868, + "step": 2390 + }, + { + "epoch": 6.385894876912841, + "grad_norm": 10.239131927490234, + "learning_rate": 0.00015556801666656312, + "loss": 0.4423, + "step": 2395 + }, + { + "epoch": 6.399201596806387, + "grad_norm": 0.012938698753714561, + "learning_rate": 0.00015539200638661104, + "loss": 0.0325, + "step": 2400 + }, + { + "epoch": 6.412508316699934, + "grad_norm": 0.37707385420799255, + "learning_rate": 0.0001552157481968148, + "loss": 0.12, + "step": 2405 + }, + { + "epoch": 6.425815036593479, + "grad_norm": 31.20591926574707, + "learning_rate": 0.00015503924288602715, + "loss": 0.0575, + "step": 2410 + }, + { + "epoch": 6.439121756487026, + "grad_norm": 16.725509643554688, + "learning_rate": 0.000154862491244207, + "loss": 0.3234, + "step": 2415 + }, + { + "epoch": 6.452428476380573, + "grad_norm": 2.5812032222747803, + "learning_rate": 0.0001546854940624156, + "loss": 0.1438, + "step": 2420 + }, + { + "epoch": 6.465735196274118, + "grad_norm": 7.374899864196777, + "learning_rate": 0.00015450825213281317, + "loss": 0.3837, + "step": 2425 + }, + { + "epoch": 6.479041916167665, + "grad_norm": 0.6281996369361877, + "learning_rate": 0.00015433076624865531, + "loss": 0.2103, + "step": 2430 + }, + { + "epoch": 6.492348636061211, + "grad_norm": 8.941553115844727, + "learning_rate": 0.00015415303720428945, + "loss": 0.5357, + "step": 2435 + }, + { + "epoch": 6.505655355954757, + "grad_norm": 0.4060121774673462, + "learning_rate": 0.0001539750657951513, + "loss": 0.2006, + "step": 2440 + }, + { + "epoch": 6.518962075848304, + "grad_norm": 30.6502628326416, + "learning_rate": 0.00015379685281776125, + "loss": 0.2649, + "step": 2445 + }, + { + "epoch": 6.5322687957418495, + "grad_norm": 0.9821872711181641, + "learning_rate": 0.00015361839906972096, + "loss": 0.1418, + "step": 2450 + }, + { + "epoch": 6.545575515635396, + "grad_norm": 2.888770341873169, + "learning_rate": 0.00015343970534970947, + "loss": 0.2907, + "step": 2455 + }, + { + "epoch": 6.558882235528942, + "grad_norm": 0.7967215180397034, + "learning_rate": 0.00015326077245747999, + "loss": 0.0516, + "step": 2460 + }, + { + "epoch": 6.572188955422488, + "grad_norm": 38.39204788208008, + "learning_rate": 0.00015308160119385615, + "loss": 0.154, + "step": 2465 + }, + { + "epoch": 6.585495675316035, + "grad_norm": 1.5320662260055542, + "learning_rate": 0.00015290219236072835, + "loss": 0.0774, + "step": 2470 + }, + { + "epoch": 6.598802395209581, + "grad_norm": 0.0738963857293129, + "learning_rate": 0.00015272254676105025, + "loss": 0.0356, + "step": 2475 + }, + { + "epoch": 6.612109115103127, + "grad_norm": 53.34037399291992, + "learning_rate": 0.00015254266519883525, + "loss": 0.2408, + "step": 2480 + }, + { + "epoch": 6.625415834996673, + "grad_norm": 1.6895263195037842, + "learning_rate": 0.00015236254847915274, + "loss": 0.1396, + "step": 2485 + }, + { + "epoch": 6.63872255489022, + "grad_norm": 2.0314857959747314, + "learning_rate": 0.0001521821974081246, + "loss": 0.0907, + "step": 2490 + }, + { + "epoch": 6.652029274783766, + "grad_norm": 6.590915679931641, + "learning_rate": 0.00015200161279292155, + "loss": 0.1152, + "step": 2495 + }, + { + "epoch": 6.665335994677312, + "grad_norm": 11.798026084899902, + "learning_rate": 0.00015182079544175955, + "loss": 0.0845, + "step": 2500 + }, + { + "epoch": 6.6786427145708585, + "grad_norm": 2.838351011276245, + "learning_rate": 0.0001516397461638962, + "loss": 0.3289, + "step": 2505 + }, + { + "epoch": 6.691949434464404, + "grad_norm": 1.8574706315994263, + "learning_rate": 0.0001514584657696271, + "loss": 0.1179, + "step": 2510 + }, + { + "epoch": 6.705256154357951, + "grad_norm": 2.1909406185150146, + "learning_rate": 0.00015127695507028213, + "loss": 0.2805, + "step": 2515 + }, + { + "epoch": 6.718562874251497, + "grad_norm": 0.1210150420665741, + "learning_rate": 0.00015109521487822206, + "loss": 0.0867, + "step": 2520 + }, + { + "epoch": 6.731869594145043, + "grad_norm": 14.33542537689209, + "learning_rate": 0.00015091324600683472, + "loss": 0.2439, + "step": 2525 + }, + { + "epoch": 6.74517631403859, + "grad_norm": 26.803274154663086, + "learning_rate": 0.0001507310492705313, + "loss": 0.1219, + "step": 2530 + }, + { + "epoch": 6.758483033932135, + "grad_norm": 0.3590647578239441, + "learning_rate": 0.000150548625484743, + "loss": 0.193, + "step": 2535 + }, + { + "epoch": 6.771789753825682, + "grad_norm": 0.17284749448299408, + "learning_rate": 0.00015036597546591699, + "loss": 0.0963, + "step": 2540 + }, + { + "epoch": 6.7850964737192285, + "grad_norm": 2.4201717376708984, + "learning_rate": 0.00015018310003151312, + "loss": 0.1209, + "step": 2545 + }, + { + "epoch": 6.798403193612774, + "grad_norm": 1.2542929649353027, + "learning_rate": 0.00015000000000000001, + "loss": 0.2455, + "step": 2550 + }, + { + "epoch": 6.811709913506321, + "grad_norm": 0.8970535397529602, + "learning_rate": 0.0001498166761908515, + "loss": 0.0261, + "step": 2555 + }, + { + "epoch": 6.825016633399867, + "grad_norm": 3.8939568996429443, + "learning_rate": 0.000149633129424543, + "loss": 0.3337, + "step": 2560 + }, + { + "epoch": 6.838323353293413, + "grad_norm": 14.508380889892578, + "learning_rate": 0.0001494493605225477, + "loss": 0.1373, + "step": 2565 + }, + { + "epoch": 6.85163007318696, + "grad_norm": 0.10055989772081375, + "learning_rate": 0.00014926537030733302, + "loss": 0.1465, + "step": 2570 + }, + { + "epoch": 6.864936793080505, + "grad_norm": 0.039826489984989166, + "learning_rate": 0.00014908115960235682, + "loss": 0.1221, + "step": 2575 + }, + { + "epoch": 6.878243512974052, + "grad_norm": 0.798770010471344, + "learning_rate": 0.0001488967292320639, + "loss": 0.118, + "step": 2580 + }, + { + "epoch": 6.891550232867598, + "grad_norm": 0.03231286630034447, + "learning_rate": 0.00014871208002188203, + "loss": 0.1685, + "step": 2585 + }, + { + "epoch": 6.904856952761144, + "grad_norm": 42.45045471191406, + "learning_rate": 0.00014852721279821852, + "loss": 0.3227, + "step": 2590 + }, + { + "epoch": 6.918163672654691, + "grad_norm": 19.435428619384766, + "learning_rate": 0.00014834212838845637, + "loss": 0.0843, + "step": 2595 + }, + { + "epoch": 6.931470392548237, + "grad_norm": 26.672100067138672, + "learning_rate": 0.00014815682762095065, + "loss": 0.1584, + "step": 2600 + }, + { + "epoch": 6.944777112441783, + "grad_norm": 13.749527931213379, + "learning_rate": 0.00014797131132502465, + "loss": 0.1462, + "step": 2605 + }, + { + "epoch": 6.95808383233533, + "grad_norm": 0.3556053638458252, + "learning_rate": 0.00014778558033096633, + "loss": 0.5992, + "step": 2610 + }, + { + "epoch": 6.9713905522288755, + "grad_norm": 1.5534833669662476, + "learning_rate": 0.00014759963547002458, + "loss": 0.369, + "step": 2615 + }, + { + "epoch": 6.984697272122422, + "grad_norm": 0.774702787399292, + "learning_rate": 0.0001474134775744054, + "loss": 0.2468, + "step": 2620 + }, + { + "epoch": 6.998003992015968, + "grad_norm": 3.043125867843628, + "learning_rate": 0.0001472271074772683, + "loss": 0.1326, + "step": 2625 + }, + { + "epoch": 6.998003992015968, + "eval_loss": 0.7454941868782043, + "eval_macro_f1": 68.01755519864683, + "eval_macro_precision": 69.74014647112368, + "eval_macro_recall": 67.37265826177689, + "eval_micro_f1": 87.12574850299401, + "eval_micro_precision": 87.12574850299401, + "eval_micro_recall": 87.12574850299401, + "eval_runtime": 4.9043, + "eval_samples_per_second": 544.832, + "eval_steps_per_second": 34.052, + "step": 2625 + }, + { + "epoch": 7.013306719893547, + "grad_norm": 3.137073516845703, + "learning_rate": 0.00014704052601272242, + "loss": 0.0919, + "step": 2630 + }, + { + "epoch": 7.026613439787092, + "grad_norm": 0.6686985492706299, + "learning_rate": 0.00014685373401582296, + "loss": 0.0532, + "step": 2635 + }, + { + "epoch": 7.039920159680639, + "grad_norm": 0.02329118922352791, + "learning_rate": 0.00014666673232256738, + "loss": 0.0643, + "step": 2640 + }, + { + "epoch": 7.053226879574185, + "grad_norm": 1.519888997077942, + "learning_rate": 0.0001464795217698916, + "loss": 0.0942, + "step": 2645 + }, + { + "epoch": 7.066533599467731, + "grad_norm": 4.10860538482666, + "learning_rate": 0.00014629210319566627, + "loss": 0.0289, + "step": 2650 + }, + { + "epoch": 7.079840319361278, + "grad_norm": 3.4810853004455566, + "learning_rate": 0.00014610447743869314, + "loss": 0.3145, + "step": 2655 + }, + { + "epoch": 7.0931470392548235, + "grad_norm": 3.5535147190093994, + "learning_rate": 0.00014591664533870118, + "loss": 0.089, + "step": 2660 + }, + { + "epoch": 7.10645375914837, + "grad_norm": 0.7342696785926819, + "learning_rate": 0.00014572860773634286, + "loss": 0.0405, + "step": 2665 + }, + { + "epoch": 7.119760479041916, + "grad_norm": 0.0036852105986326933, + "learning_rate": 0.00014554036547319033, + "loss": 0.1064, + "step": 2670 + }, + { + "epoch": 7.133067198935462, + "grad_norm": 0.3384684920310974, + "learning_rate": 0.00014535191939173177, + "loss": 0.1589, + "step": 2675 + }, + { + "epoch": 7.146373918829009, + "grad_norm": 24.363168716430664, + "learning_rate": 0.0001451632703353676, + "loss": 0.1389, + "step": 2680 + }, + { + "epoch": 7.159680638722555, + "grad_norm": 0.233006089925766, + "learning_rate": 0.0001449744191484066, + "loss": 0.2027, + "step": 2685 + }, + { + "epoch": 7.172987358616101, + "grad_norm": 0.4666266143321991, + "learning_rate": 0.00014478536667606218, + "loss": 0.1332, + "step": 2690 + }, + { + "epoch": 7.186294078509647, + "grad_norm": 1.2150986194610596, + "learning_rate": 0.00014459611376444864, + "loss": 0.1118, + "step": 2695 + }, + { + "epoch": 7.199600798403194, + "grad_norm": 2.3224456310272217, + "learning_rate": 0.00014440666126057744, + "loss": 0.2385, + "step": 2700 + }, + { + "epoch": 7.21290751829674, + "grad_norm": 2.5107779502868652, + "learning_rate": 0.00014421701001235315, + "loss": 0.1927, + "step": 2705 + }, + { + "epoch": 7.226214238190286, + "grad_norm": 11.970067024230957, + "learning_rate": 0.00014402716086856998, + "loss": 0.1681, + "step": 2710 + }, + { + "epoch": 7.2395209580838324, + "grad_norm": 2.186711072921753, + "learning_rate": 0.00014383711467890774, + "loss": 0.1937, + "step": 2715 + }, + { + "epoch": 7.252827677977379, + "grad_norm": 0.2929200828075409, + "learning_rate": 0.00014364687229392824, + "loss": 0.0433, + "step": 2720 + }, + { + "epoch": 7.266134397870925, + "grad_norm": 1.6432567834854126, + "learning_rate": 0.00014345643456507124, + "loss": 0.288, + "step": 2725 + }, + { + "epoch": 7.279441117764471, + "grad_norm": 17.002174377441406, + "learning_rate": 0.00014326580234465085, + "loss": 0.1176, + "step": 2730 + }, + { + "epoch": 7.292747837658017, + "grad_norm": 1.520616888999939, + "learning_rate": 0.00014307497648585163, + "loss": 0.1465, + "step": 2735 + }, + { + "epoch": 7.306054557551564, + "grad_norm": 1.1925784349441528, + "learning_rate": 0.0001428839578427247, + "loss": 0.1364, + "step": 2740 + }, + { + "epoch": 7.319361277445109, + "grad_norm": 0.4294731616973877, + "learning_rate": 0.0001426927472701842, + "loss": 0.1293, + "step": 2745 + }, + { + "epoch": 7.332667997338656, + "grad_norm": 1.5070288181304932, + "learning_rate": 0.000142501345624003, + "loss": 0.074, + "step": 2750 + }, + { + "epoch": 7.3459747172322025, + "grad_norm": 0.8141427636146545, + "learning_rate": 0.00014230975376080935, + "loss": 0.1783, + "step": 2755 + }, + { + "epoch": 7.359281437125748, + "grad_norm": 20.88873291015625, + "learning_rate": 0.00014211797253808268, + "loss": 0.2202, + "step": 2760 + }, + { + "epoch": 7.372588157019295, + "grad_norm": 29.461524963378906, + "learning_rate": 0.00014192600281414994, + "loss": 0.3192, + "step": 2765 + }, + { + "epoch": 7.385894876912841, + "grad_norm": 0.10030537843704224, + "learning_rate": 0.0001417338454481818, + "loss": 0.2397, + "step": 2770 + }, + { + "epoch": 7.399201596806387, + "grad_norm": 0.23451067507266998, + "learning_rate": 0.00014154150130018866, + "loss": 0.0601, + "step": 2775 + }, + { + "epoch": 7.412508316699934, + "grad_norm": 3.4268810749053955, + "learning_rate": 0.00014134897123101688, + "loss": 0.0454, + "step": 2780 + }, + { + "epoch": 7.425815036593479, + "grad_norm": 32.5483283996582, + "learning_rate": 0.00014115625610234495, + "loss": 0.2532, + "step": 2785 + }, + { + "epoch": 7.439121756487026, + "grad_norm": 0.9835365414619446, + "learning_rate": 0.00014096335677667954, + "loss": 0.1134, + "step": 2790 + }, + { + "epoch": 7.452428476380573, + "grad_norm": 4.208250522613525, + "learning_rate": 0.00014077027411735183, + "loss": 0.0956, + "step": 2795 + }, + { + "epoch": 7.465735196274118, + "grad_norm": 3.528183937072754, + "learning_rate": 0.0001405770089885134, + "loss": 0.1693, + "step": 2800 + }, + { + "epoch": 7.479041916167665, + "grad_norm": 5.837916851043701, + "learning_rate": 0.00014038356225513248, + "loss": 0.1955, + "step": 2805 + }, + { + "epoch": 7.492348636061211, + "grad_norm": 11.988131523132324, + "learning_rate": 0.00014018993478299017, + "loss": 0.1545, + "step": 2810 + }, + { + "epoch": 7.505655355954757, + "grad_norm": 0.6364012360572815, + "learning_rate": 0.00013999612743867643, + "loss": 0.1219, + "step": 2815 + }, + { + "epoch": 7.518962075848304, + "grad_norm": 0.8442111611366272, + "learning_rate": 0.00013980214108958624, + "loss": 0.2948, + "step": 2820 + }, + { + "epoch": 7.5322687957418495, + "grad_norm": 4.432058334350586, + "learning_rate": 0.0001396079766039157, + "loss": 0.0897, + "step": 2825 + }, + { + "epoch": 7.545575515635396, + "grad_norm": 31.221689224243164, + "learning_rate": 0.00013941363485065822, + "loss": 0.2486, + "step": 2830 + }, + { + "epoch": 7.558882235528942, + "grad_norm": 0.8469045162200928, + "learning_rate": 0.00013921911669960055, + "loss": 0.0861, + "step": 2835 + }, + { + "epoch": 7.572188955422488, + "grad_norm": 1.0864439010620117, + "learning_rate": 0.00013902442302131894, + "loss": 0.1102, + "step": 2840 + }, + { + "epoch": 7.585495675316035, + "grad_norm": 42.718570709228516, + "learning_rate": 0.00013882955468717524, + "loss": 0.0893, + "step": 2845 + }, + { + "epoch": 7.598802395209581, + "grad_norm": 7.022069931030273, + "learning_rate": 0.00013863451256931287, + "loss": 0.1172, + "step": 2850 + }, + { + "epoch": 7.612109115103127, + "grad_norm": 0.48820632696151733, + "learning_rate": 0.0001384392975406532, + "loss": 0.0843, + "step": 2855 + }, + { + "epoch": 7.625415834996673, + "grad_norm": 0.0070191072300076485, + "learning_rate": 0.00013824391047489128, + "loss": 0.1253, + "step": 2860 + }, + { + "epoch": 7.63872255489022, + "grad_norm": 0.38560786843299866, + "learning_rate": 0.0001380483522464923, + "loss": 0.0797, + "step": 2865 + }, + { + "epoch": 7.652029274783766, + "grad_norm": 5.101590156555176, + "learning_rate": 0.0001378526237306874, + "loss": 0.2165, + "step": 2870 + }, + { + "epoch": 7.665335994677312, + "grad_norm": 4.549017429351807, + "learning_rate": 0.00013765672580346987, + "loss": 0.5324, + "step": 2875 + }, + { + "epoch": 7.6786427145708585, + "grad_norm": 0.4050997495651245, + "learning_rate": 0.00013746065934159123, + "loss": 0.1377, + "step": 2880 + }, + { + "epoch": 7.691949434464404, + "grad_norm": 0.03987701237201691, + "learning_rate": 0.0001372644252225572, + "loss": 0.0435, + "step": 2885 + }, + { + "epoch": 7.705256154357951, + "grad_norm": 13.718110084533691, + "learning_rate": 0.00013706802432462395, + "loss": 0.4619, + "step": 2890 + }, + { + "epoch": 7.718562874251497, + "grad_norm": 0.03140031918883324, + "learning_rate": 0.0001368714575267941, + "loss": 0.0112, + "step": 2895 + }, + { + "epoch": 7.731869594145043, + "grad_norm": 0.15146759152412415, + "learning_rate": 0.00013667472570881264, + "loss": 0.0316, + "step": 2900 + }, + { + "epoch": 7.74517631403859, + "grad_norm": 0.6249554753303528, + "learning_rate": 0.00013647782975116326, + "loss": 0.1377, + "step": 2905 + }, + { + "epoch": 7.758483033932135, + "grad_norm": 0.06544558703899384, + "learning_rate": 0.0001362807705350641, + "loss": 0.1871, + "step": 2910 + }, + { + "epoch": 7.771789753825682, + "grad_norm": 3.160745143890381, + "learning_rate": 0.0001360835489424642, + "loss": 0.1802, + "step": 2915 + }, + { + "epoch": 7.7850964737192285, + "grad_norm": 13.635746002197266, + "learning_rate": 0.00013588616585603907, + "loss": 0.2655, + "step": 2920 + }, + { + "epoch": 7.798403193612774, + "grad_norm": 19.401350021362305, + "learning_rate": 0.00013568862215918717, + "loss": 0.1116, + "step": 2925 + }, + { + "epoch": 7.811709913506321, + "grad_norm": 0.4326021671295166, + "learning_rate": 0.00013549091873602578, + "loss": 0.1016, + "step": 2930 + }, + { + "epoch": 7.825016633399867, + "grad_norm": 2.293384075164795, + "learning_rate": 0.00013529305647138687, + "loss": 0.0957, + "step": 2935 + }, + { + "epoch": 7.838323353293413, + "grad_norm": 0.6911963224411011, + "learning_rate": 0.00013509503625081358, + "loss": 0.1346, + "step": 2940 + }, + { + "epoch": 7.85163007318696, + "grad_norm": 3.5838940143585205, + "learning_rate": 0.00013489685896055572, + "loss": 0.1863, + "step": 2945 + }, + { + "epoch": 7.864936793080505, + "grad_norm": 0.15642301738262177, + "learning_rate": 0.00013469852548756624, + "loss": 0.2231, + "step": 2950 + }, + { + "epoch": 7.878243512974052, + "grad_norm": 12.195527076721191, + "learning_rate": 0.00013450003671949706, + "loss": 0.1953, + "step": 2955 + }, + { + "epoch": 7.891550232867598, + "grad_norm": 2.0001468658447266, + "learning_rate": 0.00013430139354469515, + "loss": 0.1195, + "step": 2960 + }, + { + "epoch": 7.904856952761144, + "grad_norm": 11.265728950500488, + "learning_rate": 0.00013410259685219845, + "loss": 0.2233, + "step": 2965 + }, + { + "epoch": 7.918163672654691, + "grad_norm": 0.5948832631111145, + "learning_rate": 0.00013390364753173206, + "loss": 0.0816, + "step": 2970 + }, + { + "epoch": 7.931470392548237, + "grad_norm": 0.09405430406332016, + "learning_rate": 0.00013370454647370418, + "loss": 0.0419, + "step": 2975 + }, + { + "epoch": 7.944777112441783, + "grad_norm": 0.07850921154022217, + "learning_rate": 0.00013350529456920206, + "loss": 0.1481, + "step": 2980 + }, + { + "epoch": 7.95808383233533, + "grad_norm": 0.05034489929676056, + "learning_rate": 0.00013330589270998808, + "loss": 0.0068, + "step": 2985 + }, + { + "epoch": 7.9713905522288755, + "grad_norm": 2.067530632019043, + "learning_rate": 0.0001331063417884958, + "loss": 0.2244, + "step": 2990 + }, + { + "epoch": 7.984697272122422, + "grad_norm": 0.05471380800008774, + "learning_rate": 0.0001329066426978259, + "loss": 0.0941, + "step": 2995 + }, + { + "epoch": 7.998003992015968, + "grad_norm": 5.7279133796691895, + "learning_rate": 0.00013270679633174218, + "loss": 0.4085, + "step": 3000 + }, + { + "epoch": 7.998003992015968, + "eval_loss": 0.9578362107276917, + "eval_macro_f1": 67.62551780945887, + "eval_macro_precision": 73.99170552196239, + "eval_macro_recall": 64.56609836337587, + "eval_micro_f1": 88.02395209580838, + "eval_micro_precision": 88.02395209580838, + "eval_micro_recall": 88.02395209580838, + "eval_runtime": 4.5288, + "eval_samples_per_second": 590.002, + "eval_steps_per_second": 36.875, + "step": 3000 + }, + { + "epoch": 8.013306719893546, + "grad_norm": 23.61517333984375, + "learning_rate": 0.00013250680358466754, + "loss": 0.2144, + "step": 3005 + }, + { + "epoch": 8.026613439787093, + "grad_norm": 1.6236063241958618, + "learning_rate": 0.0001323066653516801, + "loss": 0.145, + "step": 3010 + }, + { + "epoch": 8.039920159680639, + "grad_norm": 0.12873251736164093, + "learning_rate": 0.00013210638252850908, + "loss": 0.0148, + "step": 3015 + }, + { + "epoch": 8.053226879574185, + "grad_norm": 1.4208223819732666, + "learning_rate": 0.0001319059560115308, + "loss": 0.1435, + "step": 3020 + }, + { + "epoch": 8.066533599467732, + "grad_norm": 1.1595903635025024, + "learning_rate": 0.00013170538669776468, + "loss": 0.0784, + "step": 3025 + }, + { + "epoch": 8.079840319361278, + "grad_norm": 2.5915064811706543, + "learning_rate": 0.0001315046754848693, + "loss": 0.092, + "step": 3030 + }, + { + "epoch": 8.093147039254823, + "grad_norm": 1.323926329612732, + "learning_rate": 0.00013130382327113823, + "loss": 0.0972, + "step": 3035 + }, + { + "epoch": 8.10645375914837, + "grad_norm": 1.1341009140014648, + "learning_rate": 0.00013110283095549614, + "loss": 0.0939, + "step": 3040 + }, + { + "epoch": 8.119760479041917, + "grad_norm": 0.8324944972991943, + "learning_rate": 0.00013090169943749476, + "loss": 0.1248, + "step": 3045 + }, + { + "epoch": 8.133067198935462, + "grad_norm": 0.12022148072719574, + "learning_rate": 0.00013070042961730877, + "loss": 0.0167, + "step": 3050 + }, + { + "epoch": 8.146373918829008, + "grad_norm": 0.08643687516450882, + "learning_rate": 0.00013049902239573187, + "loss": 0.0794, + "step": 3055 + }, + { + "epoch": 8.159680638722556, + "grad_norm": 0.017748689278960228, + "learning_rate": 0.00013029747867417276, + "loss": 0.0169, + "step": 3060 + }, + { + "epoch": 8.172987358616101, + "grad_norm": 0.03752296790480614, + "learning_rate": 0.00013009579935465085, + "loss": 0.0268, + "step": 3065 + }, + { + "epoch": 8.186294078509647, + "grad_norm": 0.5308043956756592, + "learning_rate": 0.0001298939853397927, + "loss": 0.1967, + "step": 3070 + }, + { + "epoch": 8.199600798403194, + "grad_norm": 2.3242459297180176, + "learning_rate": 0.0001296920375328275, + "loss": 0.0467, + "step": 3075 + }, + { + "epoch": 8.21290751829674, + "grad_norm": 0.10183367133140564, + "learning_rate": 0.0001294899568375833, + "loss": 0.0591, + "step": 3080 + }, + { + "epoch": 8.226214238190286, + "grad_norm": 0.013796583749353886, + "learning_rate": 0.00012928774415848295, + "loss": 0.1172, + "step": 3085 + }, + { + "epoch": 8.239520958083832, + "grad_norm": 0.060439061373472214, + "learning_rate": 0.0001290854004005399, + "loss": 0.1231, + "step": 3090 + }, + { + "epoch": 8.252827677977379, + "grad_norm": 0.04022917151451111, + "learning_rate": 0.0001288829264693544, + "loss": 0.0047, + "step": 3095 + }, + { + "epoch": 8.266134397870925, + "grad_norm": 1.164883017539978, + "learning_rate": 0.00012868032327110904, + "loss": 0.0269, + "step": 3100 + }, + { + "epoch": 8.27944111776447, + "grad_norm": 13.31337833404541, + "learning_rate": 0.00012847759171256523, + "loss": 0.1548, + "step": 3105 + }, + { + "epoch": 8.292747837658018, + "grad_norm": 0.5353259444236755, + "learning_rate": 0.00012827473270105873, + "loss": 0.0542, + "step": 3110 + }, + { + "epoch": 8.306054557551564, + "grad_norm": 0.004808424971997738, + "learning_rate": 0.00012807174714449571, + "loss": 0.0818, + "step": 3115 + }, + { + "epoch": 8.31936127744511, + "grad_norm": 0.9103612899780273, + "learning_rate": 0.0001278686359513488, + "loss": 0.0221, + "step": 3120 + }, + { + "epoch": 8.332667997338657, + "grad_norm": 48.99919128417969, + "learning_rate": 0.0001276654000306527, + "loss": 0.2673, + "step": 3125 + }, + { + "epoch": 8.345974717232203, + "grad_norm": 1.4264007806777954, + "learning_rate": 0.00012746204029200067, + "loss": 0.1625, + "step": 3130 + }, + { + "epoch": 8.359281437125748, + "grad_norm": 15.919877052307129, + "learning_rate": 0.0001272585576455398, + "loss": 0.1033, + "step": 3135 + }, + { + "epoch": 8.372588157019294, + "grad_norm": 49.93979263305664, + "learning_rate": 0.00012705495300196747, + "loss": 0.2639, + "step": 3140 + }, + { + "epoch": 8.385894876912841, + "grad_norm": 1.7972381114959717, + "learning_rate": 0.00012685122727252695, + "loss": 0.0508, + "step": 3145 + }, + { + "epoch": 8.399201596806387, + "grad_norm": 0.6764810085296631, + "learning_rate": 0.00012664738136900348, + "loss": 0.1182, + "step": 3150 + }, + { + "epoch": 8.412508316699933, + "grad_norm": 2.4136905670166016, + "learning_rate": 0.00012644341620372023, + "loss": 0.1769, + "step": 3155 + }, + { + "epoch": 8.42581503659348, + "grad_norm": 0.28830185532569885, + "learning_rate": 0.00012623933268953396, + "loss": 0.0763, + "step": 3160 + }, + { + "epoch": 8.439121756487026, + "grad_norm": 1.7211086750030518, + "learning_rate": 0.0001260351317398312, + "loss": 0.0386, + "step": 3165 + }, + { + "epoch": 8.452428476380572, + "grad_norm": 0.05553643777966499, + "learning_rate": 0.00012583081426852411, + "loss": 0.0522, + "step": 3170 + }, + { + "epoch": 8.46573519627412, + "grad_norm": 1.4411470890045166, + "learning_rate": 0.00012562638119004626, + "loss": 0.0946, + "step": 3175 + }, + { + "epoch": 8.479041916167665, + "grad_norm": 0.201587975025177, + "learning_rate": 0.00012542183341934872, + "loss": 0.0996, + "step": 3180 + }, + { + "epoch": 8.49234863606121, + "grad_norm": 1.4264519214630127, + "learning_rate": 0.00012521717187189573, + "loss": 0.2037, + "step": 3185 + }, + { + "epoch": 8.505655355954758, + "grad_norm": 2.3621273040771484, + "learning_rate": 0.00012501239746366092, + "loss": 0.0675, + "step": 3190 + }, + { + "epoch": 8.518962075848304, + "grad_norm": 1.260345458984375, + "learning_rate": 0.0001248075111111229, + "loss": 0.1723, + "step": 3195 + }, + { + "epoch": 8.53226879574185, + "grad_norm": 0.44682565331459045, + "learning_rate": 0.00012460251373126136, + "loss": 0.336, + "step": 3200 + }, + { + "epoch": 8.545575515635395, + "grad_norm": 0.35291796922683716, + "learning_rate": 0.00012439740624155284, + "loss": 0.0561, + "step": 3205 + }, + { + "epoch": 8.558882235528943, + "grad_norm": 0.7354090809822083, + "learning_rate": 0.00012419218955996676, + "loss": 0.2095, + "step": 3210 + }, + { + "epoch": 8.572188955422488, + "grad_norm": 2.9078779220581055, + "learning_rate": 0.00012398686460496122, + "loss": 0.0378, + "step": 3215 + }, + { + "epoch": 8.585495675316034, + "grad_norm": 0.8109474778175354, + "learning_rate": 0.00012378143229547882, + "loss": 0.236, + "step": 3220 + }, + { + "epoch": 8.598802395209582, + "grad_norm": 2.6716489791870117, + "learning_rate": 0.00012357589355094275, + "loss": 0.0557, + "step": 3225 + }, + { + "epoch": 8.612109115103127, + "grad_norm": 36.927249908447266, + "learning_rate": 0.00012337024929125242, + "loss": 0.1876, + "step": 3230 + }, + { + "epoch": 8.625415834996673, + "grad_norm": 1.459558129310608, + "learning_rate": 0.0001231645004367796, + "loss": 0.1989, + "step": 3235 + }, + { + "epoch": 8.638722554890219, + "grad_norm": 1.2060911655426025, + "learning_rate": 0.0001229586479083641, + "loss": 0.0587, + "step": 3240 + }, + { + "epoch": 8.652029274783766, + "grad_norm": 0.03878104314208031, + "learning_rate": 0.00012275269262730982, + "loss": 0.026, + "step": 3245 + }, + { + "epoch": 8.665335994677312, + "grad_norm": 0.032785095274448395, + "learning_rate": 0.00012254663551538046, + "loss": 0.1203, + "step": 3250 + }, + { + "epoch": 8.678642714570858, + "grad_norm": 0.01734250970184803, + "learning_rate": 0.00012234047749479544, + "loss": 0.0483, + "step": 3255 + }, + { + "epoch": 8.691949434464405, + "grad_norm": 0.004084521438926458, + "learning_rate": 0.0001221342194882259, + "loss": 0.1555, + "step": 3260 + }, + { + "epoch": 8.70525615435795, + "grad_norm": 0.4270322620868683, + "learning_rate": 0.00012192786241879033, + "loss": 0.0689, + "step": 3265 + }, + { + "epoch": 8.718562874251496, + "grad_norm": 73.75556945800781, + "learning_rate": 0.00012172140721005079, + "loss": 0.0641, + "step": 3270 + }, + { + "epoch": 8.731869594145044, + "grad_norm": 15.693498611450195, + "learning_rate": 0.00012151485478600839, + "loss": 0.2614, + "step": 3275 + }, + { + "epoch": 8.74517631403859, + "grad_norm": 0.04169188439846039, + "learning_rate": 0.00012130820607109936, + "loss": 0.0439, + "step": 3280 + }, + { + "epoch": 8.758483033932135, + "grad_norm": 0.4641147553920746, + "learning_rate": 0.000121101461990191, + "loss": 0.0238, + "step": 3285 + }, + { + "epoch": 8.771789753825683, + "grad_norm": 3.9536852836608887, + "learning_rate": 0.00012089462346857725, + "loss": 0.1257, + "step": 3290 + }, + { + "epoch": 8.785096473719229, + "grad_norm": 2.0397369861602783, + "learning_rate": 0.00012068769143197487, + "loss": 0.1766, + "step": 3295 + }, + { + "epoch": 8.798403193612774, + "grad_norm": 30.004472732543945, + "learning_rate": 0.00012048066680651908, + "loss": 0.0752, + "step": 3300 + }, + { + "epoch": 8.81170991350632, + "grad_norm": 1.0506658554077148, + "learning_rate": 0.0001202735505187595, + "loss": 0.2345, + "step": 3305 + }, + { + "epoch": 8.825016633399867, + "grad_norm": 1.2177345752716064, + "learning_rate": 0.00012006634349565602, + "loss": 0.0594, + "step": 3310 + }, + { + "epoch": 8.838323353293413, + "grad_norm": 0.415720134973526, + "learning_rate": 0.00011985904666457455, + "loss": 0.0283, + "step": 3315 + }, + { + "epoch": 8.851630073186959, + "grad_norm": 0.6650182604789734, + "learning_rate": 0.00011965166095328301, + "loss": 0.0981, + "step": 3320 + }, + { + "epoch": 8.864936793080506, + "grad_norm": 129.6624298095703, + "learning_rate": 0.00011944418728994709, + "loss": 0.1986, + "step": 3325 + }, + { + "epoch": 8.878243512974052, + "grad_norm": 18.93353843688965, + "learning_rate": 0.00011923662660312611, + "loss": 0.188, + "step": 3330 + }, + { + "epoch": 8.891550232867598, + "grad_norm": 0.8205522298812866, + "learning_rate": 0.00011902897982176882, + "loss": 0.0841, + "step": 3335 + }, + { + "epoch": 8.904856952761145, + "grad_norm": 32.96717071533203, + "learning_rate": 0.00011882124787520934, + "loss": 0.0548, + "step": 3340 + }, + { + "epoch": 8.918163672654691, + "grad_norm": 18.162229537963867, + "learning_rate": 0.00011861343169316301, + "loss": 0.0967, + "step": 3345 + }, + { + "epoch": 8.931470392548237, + "grad_norm": 0.480948269367218, + "learning_rate": 0.00011840553220572204, + "loss": 0.0964, + "step": 3350 + }, + { + "epoch": 8.944777112441782, + "grad_norm": 0.0540715716779232, + "learning_rate": 0.0001181975503433516, + "loss": 0.0041, + "step": 3355 + }, + { + "epoch": 8.95808383233533, + "grad_norm": 0.6189212799072266, + "learning_rate": 0.00011798948703688539, + "loss": 0.0134, + "step": 3360 + }, + { + "epoch": 8.971390552228875, + "grad_norm": 0.002311081625521183, + "learning_rate": 0.00011778134321752182, + "loss": 0.1133, + "step": 3365 + }, + { + "epoch": 8.984697272122421, + "grad_norm": 19.883926391601562, + "learning_rate": 0.00011757311981681942, + "loss": 0.072, + "step": 3370 + }, + { + "epoch": 8.998003992015969, + "grad_norm": 2.3699491024017334, + "learning_rate": 0.00011736481776669306, + "loss": 0.0273, + "step": 3375 + }, + { + "epoch": 8.998003992015969, + "eval_loss": 1.5414294004440308, + "eval_macro_f1": 64.80803991978306, + "eval_macro_precision": 70.6655091690804, + "eval_macro_recall": 61.210875301941044, + "eval_micro_f1": 87.12574850299401, + "eval_micro_precision": 87.12574850299401, + "eval_micro_recall": 87.12574850299401, + "eval_runtime": 5.1705, + "eval_samples_per_second": 516.777, + "eval_steps_per_second": 32.299, + "step": 3375 + }, + { + "epoch": 9.013306719893546, + "grad_norm": 0.21652141213417053, + "learning_rate": 0.0001171564379994095, + "loss": 0.0913, + "step": 3380 + }, + { + "epoch": 9.026613439787093, + "grad_norm": 0.7938780188560486, + "learning_rate": 0.0001169479814475834, + "loss": 0.0546, + "step": 3385 + }, + { + "epoch": 9.039920159680639, + "grad_norm": 0.004158825613558292, + "learning_rate": 0.00011673944904417308, + "loss": 0.102, + "step": 3390 + }, + { + "epoch": 9.053226879574185, + "grad_norm": 16.037822723388672, + "learning_rate": 0.00011653084172247624, + "loss": 0.3269, + "step": 3395 + }, + { + "epoch": 9.066533599467732, + "grad_norm": 12.818014144897461, + "learning_rate": 0.00011632216041612594, + "loss": 0.268, + "step": 3400 + }, + { + "epoch": 9.079840319361278, + "grad_norm": 0.016593078151345253, + "learning_rate": 0.00011611340605908642, + "loss": 0.01, + "step": 3405 + }, + { + "epoch": 9.093147039254823, + "grad_norm": 0.01048375479876995, + "learning_rate": 0.00011590457958564878, + "loss": 0.0739, + "step": 3410 + }, + { + "epoch": 9.10645375914837, + "grad_norm": 24.694929122924805, + "learning_rate": 0.0001156956819304269, + "loss": 0.0642, + "step": 3415 + }, + { + "epoch": 9.119760479041917, + "grad_norm": 38.852474212646484, + "learning_rate": 0.00011548671402835325, + "loss": 0.0525, + "step": 3420 + }, + { + "epoch": 9.133067198935462, + "grad_norm": 0.016515236347913742, + "learning_rate": 0.00011527767681467471, + "loss": 0.0399, + "step": 3425 + }, + { + "epoch": 9.146373918829008, + "grad_norm": 0.6999401450157166, + "learning_rate": 0.00011506857122494831, + "loss": 0.095, + "step": 3430 + }, + { + "epoch": 9.159680638722556, + "grad_norm": 0.023875270038843155, + "learning_rate": 0.00011485939819503717, + "loss": 0.0629, + "step": 3435 + }, + { + "epoch": 9.172987358616101, + "grad_norm": 0.0019072931027039886, + "learning_rate": 0.00011465015866110622, + "loss": 0.0449, + "step": 3440 + }, + { + "epoch": 9.186294078509647, + "grad_norm": 0.013696919195353985, + "learning_rate": 0.000114440853559618, + "loss": 0.0745, + "step": 3445 + }, + { + "epoch": 9.199600798403194, + "grad_norm": 0.05094515159726143, + "learning_rate": 0.00011423148382732853, + "loss": 0.0808, + "step": 3450 + }, + { + "epoch": 9.21290751829674, + "grad_norm": 0.33801838755607605, + "learning_rate": 0.00011402205040128307, + "loss": 0.0772, + "step": 3455 + }, + { + "epoch": 9.226214238190286, + "grad_norm": 6.502445220947266, + "learning_rate": 0.00011381255421881198, + "loss": 0.2097, + "step": 3460 + }, + { + "epoch": 9.239520958083832, + "grad_norm": 1.0928117036819458, + "learning_rate": 0.00011360299621752644, + "loss": 0.0446, + "step": 3465 + }, + { + "epoch": 9.252827677977379, + "grad_norm": 0.864954948425293, + "learning_rate": 0.00011339337733531434, + "loss": 0.088, + "step": 3470 + }, + { + "epoch": 9.266134397870925, + "grad_norm": 10.347973823547363, + "learning_rate": 0.00011318369851033603, + "loss": 0.0197, + "step": 3475 + }, + { + "epoch": 9.27944111776447, + "grad_norm": 1.25753653049469, + "learning_rate": 0.00011297396068102017, + "loss": 0.1665, + "step": 3480 + }, + { + "epoch": 9.292747837658018, + "grad_norm": 0.006565026007592678, + "learning_rate": 0.00011276416478605949, + "loss": 0.0072, + "step": 3485 + }, + { + "epoch": 9.306054557551564, + "grad_norm": 0.14825934171676636, + "learning_rate": 0.0001125543117644065, + "loss": 0.0801, + "step": 3490 + }, + { + "epoch": 9.31936127744511, + "grad_norm": 0.837486743927002, + "learning_rate": 0.00011234440255526948, + "loss": 0.1842, + "step": 3495 + }, + { + "epoch": 9.332667997338657, + "grad_norm": 4.4184064865112305, + "learning_rate": 0.0001121344380981082, + "loss": 0.1044, + "step": 3500 + }, + { + "epoch": 9.345974717232203, + "grad_norm": 0.4564765989780426, + "learning_rate": 0.00011192441933262962, + "loss": 0.0216, + "step": 3505 + }, + { + "epoch": 9.359281437125748, + "grad_norm": 4.210375785827637, + "learning_rate": 0.00011171434719878384, + "loss": 0.3018, + "step": 3510 + }, + { + "epoch": 9.372588157019294, + "grad_norm": 0.4331660866737366, + "learning_rate": 0.00011150422263675968, + "loss": 0.1116, + "step": 3515 + }, + { + "epoch": 9.385894876912841, + "grad_norm": 1.863437533378601, + "learning_rate": 0.00011129404658698081, + "loss": 0.1566, + "step": 3520 + }, + { + "epoch": 9.399201596806387, + "grad_norm": 1.0628738403320312, + "learning_rate": 0.00011108381999010111, + "loss": 0.0746, + "step": 3525 + }, + { + "epoch": 9.412508316699933, + "grad_norm": 1.0913702249526978, + "learning_rate": 0.00011087354378700086, + "loss": 0.0515, + "step": 3530 + }, + { + "epoch": 9.42581503659348, + "grad_norm": 0.40956100821495056, + "learning_rate": 0.00011066321891878227, + "loss": 0.0517, + "step": 3535 + }, + { + "epoch": 9.439121756487026, + "grad_norm": 31.597082138061523, + "learning_rate": 0.00011045284632676536, + "loss": 0.1389, + "step": 3540 + }, + { + "epoch": 9.452428476380572, + "grad_norm": 8.876389503479004, + "learning_rate": 0.00011024242695248379, + "loss": 0.1226, + "step": 3545 + }, + { + "epoch": 9.46573519627412, + "grad_norm": 0.767656683921814, + "learning_rate": 0.0001100319617376805, + "loss": 0.0444, + "step": 3550 + }, + { + "epoch": 9.479041916167665, + "grad_norm": 0.5712203979492188, + "learning_rate": 0.00010982145162430373, + "loss": 0.2447, + "step": 3555 + }, + { + "epoch": 9.49234863606121, + "grad_norm": 0.08574845641851425, + "learning_rate": 0.00010961089755450254, + "loss": 0.0436, + "step": 3560 + }, + { + "epoch": 9.505655355954758, + "grad_norm": 0.19157272577285767, + "learning_rate": 0.00010940030047062275, + "loss": 0.1011, + "step": 3565 + }, + { + "epoch": 9.518962075848304, + "grad_norm": 1.378658413887024, + "learning_rate": 0.00010918966131520277, + "loss": 0.0695, + "step": 3570 + }, + { + "epoch": 9.53226879574185, + "grad_norm": 0.7202726602554321, + "learning_rate": 0.00010897898103096917, + "loss": 0.1102, + "step": 3575 + }, + { + "epoch": 9.545575515635395, + "grad_norm": 0.020597225055098534, + "learning_rate": 0.00010876826056083273, + "loss": 0.0282, + "step": 3580 + }, + { + "epoch": 9.558882235528943, + "grad_norm": 0.7092624306678772, + "learning_rate": 0.00010855750084788398, + "loss": 0.1663, + "step": 3585 + }, + { + "epoch": 9.572188955422488, + "grad_norm": 0.05511481687426567, + "learning_rate": 0.00010834670283538914, + "loss": 0.1794, + "step": 3590 + }, + { + "epoch": 9.585495675316034, + "grad_norm": 3.0630874633789062, + "learning_rate": 0.00010813586746678583, + "loss": 0.1171, + "step": 3595 + }, + { + "epoch": 9.598802395209582, + "grad_norm": 0.7750560641288757, + "learning_rate": 0.00010792499568567884, + "loss": 0.0422, + "step": 3600 + }, + { + "epoch": 9.612109115103127, + "grad_norm": 0.020036980509757996, + "learning_rate": 0.00010771408843583598, + "loss": 0.0478, + "step": 3605 + }, + { + "epoch": 9.625415834996673, + "grad_norm": 0.4453740417957306, + "learning_rate": 0.0001075031466611837, + "loss": 0.0674, + "step": 3610 + }, + { + "epoch": 9.638722554890219, + "grad_norm": 0.08731366693973541, + "learning_rate": 0.0001072921713058031, + "loss": 0.0545, + "step": 3615 + }, + { + "epoch": 9.652029274783766, + "grad_norm": 0.90806645154953, + "learning_rate": 0.00010708116331392541, + "loss": 0.2005, + "step": 3620 + }, + { + "epoch": 9.665335994677312, + "grad_norm": 12.486852645874023, + "learning_rate": 0.0001068701236299281, + "loss": 0.0447, + "step": 3625 + }, + { + "epoch": 9.678642714570858, + "grad_norm": 0.44622567296028137, + "learning_rate": 0.00010665905319833041, + "loss": 0.0622, + "step": 3630 + }, + { + "epoch": 9.691949434464405, + "grad_norm": 0.10433344542980194, + "learning_rate": 0.00010644795296378909, + "loss": 0.0709, + "step": 3635 + }, + { + "epoch": 9.70525615435795, + "grad_norm": 0.2997910678386688, + "learning_rate": 0.00010623682387109447, + "loss": 0.2982, + "step": 3640 + }, + { + "epoch": 9.718562874251496, + "grad_norm": 1.653448462486267, + "learning_rate": 0.00010602566686516586, + "loss": 0.2432, + "step": 3645 + }, + { + "epoch": 9.731869594145044, + "grad_norm": 97.50984954833984, + "learning_rate": 0.00010581448289104758, + "loss": 0.3226, + "step": 3650 + }, + { + "epoch": 9.74517631403859, + "grad_norm": 0.7525721192359924, + "learning_rate": 0.00010560327289390468, + "loss": 0.0432, + "step": 3655 + }, + { + "epoch": 9.758483033932135, + "grad_norm": 1.433956503868103, + "learning_rate": 0.00010539203781901861, + "loss": 0.0749, + "step": 3660 + }, + { + "epoch": 9.771789753825683, + "grad_norm": 10.341765403747559, + "learning_rate": 0.00010518077861178309, + "loss": 0.0207, + "step": 3665 + }, + { + "epoch": 9.785096473719229, + "grad_norm": 0.2586815357208252, + "learning_rate": 0.00010496949621769976, + "loss": 0.0438, + "step": 3670 + }, + { + "epoch": 9.798403193612774, + "grad_norm": 0.004748414736241102, + "learning_rate": 0.00010475819158237425, + "loss": 0.0817, + "step": 3675 + }, + { + "epoch": 9.81170991350632, + "grad_norm": 0.19157640635967255, + "learning_rate": 0.00010454686565151149, + "loss": 0.0939, + "step": 3680 + }, + { + "epoch": 9.825016633399867, + "grad_norm": 0.3925589621067047, + "learning_rate": 0.00010433551937091183, + "loss": 0.0357, + "step": 3685 + }, + { + "epoch": 9.838323353293413, + "grad_norm": 0.6299968957901001, + "learning_rate": 0.00010412415368646673, + "loss": 0.0429, + "step": 3690 + }, + { + "epoch": 9.851630073186959, + "grad_norm": 0.003008866449818015, + "learning_rate": 0.00010391276954415444, + "loss": 0.2409, + "step": 3695 + }, + { + "epoch": 9.864936793080506, + "grad_norm": 33.798118591308594, + "learning_rate": 0.00010370136789003582, + "loss": 0.2173, + "step": 3700 + }, + { + "epoch": 9.878243512974052, + "grad_norm": 2.2869341373443604, + "learning_rate": 0.00010348994967025012, + "loss": 0.1955, + "step": 3705 + }, + { + "epoch": 9.891550232867598, + "grad_norm": 0.3005884885787964, + "learning_rate": 0.00010327851583101071, + "loss": 0.0804, + "step": 3710 + }, + { + "epoch": 9.904856952761145, + "grad_norm": 0.15614235401153564, + "learning_rate": 0.0001030670673186009, + "loss": 0.0821, + "step": 3715 + }, + { + "epoch": 9.918163672654691, + "grad_norm": 24.48946762084961, + "learning_rate": 0.00010285560507936961, + "loss": 0.0234, + "step": 3720 + }, + { + "epoch": 9.931470392548237, + "grad_norm": 0.33835986256599426, + "learning_rate": 0.00010264413005972735, + "loss": 0.0957, + "step": 3725 + }, + { + "epoch": 9.944777112441782, + "grad_norm": 64.79618835449219, + "learning_rate": 0.00010243264320614157, + "loss": 0.0583, + "step": 3730 + }, + { + "epoch": 9.95808383233533, + "grad_norm": 0.7911556959152222, + "learning_rate": 0.00010222114546513295, + "loss": 0.0557, + "step": 3735 + }, + { + "epoch": 9.971390552228875, + "grad_norm": 0.6701344847679138, + "learning_rate": 0.0001020096377832707, + "loss": 0.153, + "step": 3740 + }, + { + "epoch": 9.984697272122421, + "grad_norm": 1.700467586517334, + "learning_rate": 0.00010179812110716864, + "loss": 0.1336, + "step": 3745 + }, + { + "epoch": 9.998003992015969, + "grad_norm": 0.4652709364891052, + "learning_rate": 0.00010158659638348081, + "loss": 0.036, + "step": 3750 + }, + { + "epoch": 9.998003992015969, + "eval_loss": 1.1191763877868652, + "eval_macro_f1": 67.80213350209692, + "eval_macro_precision": 71.58253936792221, + "eval_macro_recall": 65.93726965528465, + "eval_micro_f1": 87.5374251497006, + "eval_micro_precision": 87.5374251497006, + "eval_micro_recall": 87.5374251497006, + "eval_runtime": 5.0299, + "eval_samples_per_second": 531.228, + "eval_steps_per_second": 33.202, + "step": 3750 + }, + { + "epoch": 10.013306719893546, + "grad_norm": 0.45541515946388245, + "learning_rate": 0.00010137506455889721, + "loss": 0.0613, + "step": 3755 + }, + { + "epoch": 10.026613439787093, + "grad_norm": 0.1207580417394638, + "learning_rate": 0.00010116352658013973, + "loss": 0.0425, + "step": 3760 + }, + { + "epoch": 10.039920159680639, + "grad_norm": 0.4036962389945984, + "learning_rate": 0.00010095198339395769, + "loss": 0.0186, + "step": 3765 + }, + { + "epoch": 10.053226879574185, + "grad_norm": 0.08803912997245789, + "learning_rate": 0.00010074043594712379, + "loss": 0.0239, + "step": 3770 + }, + { + "epoch": 10.066533599467732, + "grad_norm": 0.4955994188785553, + "learning_rate": 0.00010052888518642978, + "loss": 0.0191, + "step": 3775 + }, + { + "epoch": 10.079840319361278, + "grad_norm": 0.512108564376831, + "learning_rate": 0.00010031733205868224, + "loss": 0.0547, + "step": 3780 + }, + { + "epoch": 10.093147039254823, + "grad_norm": 0.367607057094574, + "learning_rate": 0.00010010577751069838, + "loss": 0.0353, + "step": 3785 + }, + { + "epoch": 10.10645375914837, + "grad_norm": 0.5691599249839783, + "learning_rate": 9.989422248930167e-05, + "loss": 0.0286, + "step": 3790 + }, + { + "epoch": 10.119760479041917, + "grad_norm": 0.006430297624319792, + "learning_rate": 9.968266794131777e-05, + "loss": 0.0087, + "step": 3795 + }, + { + "epoch": 10.133067198935462, + "grad_norm": 0.43629810214042664, + "learning_rate": 9.947111481357021e-05, + "loss": 0.0256, + "step": 3800 + }, + { + "epoch": 10.146373918829008, + "grad_norm": 0.5085962414741516, + "learning_rate": 9.925956405287624e-05, + "loss": 0.0159, + "step": 3805 + }, + { + "epoch": 10.159680638722556, + "grad_norm": 5.319223880767822, + "learning_rate": 9.904801660604234e-05, + "loss": 0.0566, + "step": 3810 + }, + { + "epoch": 10.172987358616101, + "grad_norm": 0.47697970271110535, + "learning_rate": 9.883647341986032e-05, + "loss": 0.1049, + "step": 3815 + }, + { + "epoch": 10.186294078509647, + "grad_norm": 25.805103302001953, + "learning_rate": 9.862493544110282e-05, + "loss": 0.2494, + "step": 3820 + }, + { + "epoch": 10.199600798403194, + "grad_norm": 0.004839545115828514, + "learning_rate": 9.84134036165192e-05, + "loss": 0.0705, + "step": 3825 + }, + { + "epoch": 10.21290751829674, + "grad_norm": 29.43570899963379, + "learning_rate": 9.820187889283137e-05, + "loss": 0.2269, + "step": 3830 + }, + { + "epoch": 10.226214238190286, + "grad_norm": 0.4234377145767212, + "learning_rate": 9.799036221672931e-05, + "loss": 0.055, + "step": 3835 + }, + { + "epoch": 10.239520958083832, + "grad_norm": 0.11629313975572586, + "learning_rate": 9.777885453486706e-05, + "loss": 0.0419, + "step": 3840 + }, + { + "epoch": 10.252827677977379, + "grad_norm": 1.7031394243240356, + "learning_rate": 9.756735679385844e-05, + "loss": 0.2843, + "step": 3845 + }, + { + "epoch": 10.266134397870925, + "grad_norm": 0.0188790000975132, + "learning_rate": 9.735586994027267e-05, + "loss": 0.0727, + "step": 3850 + }, + { + "epoch": 10.27944111776447, + "grad_norm": 0.007724573370069265, + "learning_rate": 9.71443949206304e-05, + "loss": 0.0565, + "step": 3855 + }, + { + "epoch": 10.292747837658018, + "grad_norm": 0.6192372441291809, + "learning_rate": 9.693293268139914e-05, + "loss": 0.0634, + "step": 3860 + }, + { + "epoch": 10.306054557551564, + "grad_norm": 0.005998663604259491, + "learning_rate": 9.672148416898932e-05, + "loss": 0.0553, + "step": 3865 + }, + { + "epoch": 10.31936127744511, + "grad_norm": 0.7205235362052917, + "learning_rate": 9.651005032974994e-05, + "loss": 0.012, + "step": 3870 + }, + { + "epoch": 10.332667997338657, + "grad_norm": 0.14316412806510925, + "learning_rate": 9.629863210996419e-05, + "loss": 0.0101, + "step": 3875 + }, + { + "epoch": 10.345974717232203, + "grad_norm": 0.015486049465835094, + "learning_rate": 9.608723045584557e-05, + "loss": 0.0649, + "step": 3880 + }, + { + "epoch": 10.359281437125748, + "grad_norm": 1.8815991878509521, + "learning_rate": 9.587584631353329e-05, + "loss": 0.1858, + "step": 3885 + }, + { + "epoch": 10.372588157019294, + "grad_norm": 0.5233331918716431, + "learning_rate": 9.566448062908819e-05, + "loss": 0.0641, + "step": 3890 + }, + { + "epoch": 10.385894876912841, + "grad_norm": 0.5349389910697937, + "learning_rate": 9.545313434848856e-05, + "loss": 0.0432, + "step": 3895 + }, + { + "epoch": 10.399201596806387, + "grad_norm": 0.001954582752659917, + "learning_rate": 9.524180841762577e-05, + "loss": 0.0079, + "step": 3900 + }, + { + "epoch": 10.412508316699933, + "grad_norm": 0.19347253441810608, + "learning_rate": 9.503050378230022e-05, + "loss": 0.1334, + "step": 3905 + }, + { + "epoch": 10.42581503659348, + "grad_norm": 41.07929611206055, + "learning_rate": 9.481922138821696e-05, + "loss": 0.1135, + "step": 3910 + }, + { + "epoch": 10.439121756487026, + "grad_norm": 0.6698688268661499, + "learning_rate": 9.460796218098143e-05, + "loss": 0.0256, + "step": 3915 + }, + { + "epoch": 10.452428476380572, + "grad_norm": 0.2920777201652527, + "learning_rate": 9.439672710609531e-05, + "loss": 0.0303, + "step": 3920 + }, + { + "epoch": 10.46573519627412, + "grad_norm": 0.0010179243981838226, + "learning_rate": 9.418551710895243e-05, + "loss": 0.0186, + "step": 3925 + }, + { + "epoch": 10.479041916167665, + "grad_norm": 29.91852569580078, + "learning_rate": 9.397433313483416e-05, + "loss": 0.2275, + "step": 3930 + }, + { + "epoch": 10.49234863606121, + "grad_norm": 5.829182147979736, + "learning_rate": 9.376317612890556e-05, + "loss": 0.2251, + "step": 3935 + }, + { + "epoch": 10.505655355954758, + "grad_norm": 0.5852800011634827, + "learning_rate": 9.355204703621093e-05, + "loss": 0.1962, + "step": 3940 + }, + { + "epoch": 10.518962075848304, + "grad_norm": 0.12783049046993256, + "learning_rate": 9.334094680166962e-05, + "loss": 0.0286, + "step": 3945 + }, + { + "epoch": 10.53226879574185, + "grad_norm": 0.3511544167995453, + "learning_rate": 9.312987637007191e-05, + "loss": 0.0544, + "step": 3950 + }, + { + "epoch": 10.545575515635395, + "grad_norm": 0.4987472891807556, + "learning_rate": 9.291883668607461e-05, + "loss": 0.079, + "step": 3955 + }, + { + "epoch": 10.558882235528943, + "grad_norm": 0.30509260296821594, + "learning_rate": 9.270782869419694e-05, + "loss": 0.0875, + "step": 3960 + }, + { + "epoch": 10.572188955422488, + "grad_norm": 0.721363365650177, + "learning_rate": 9.249685333881633e-05, + "loss": 0.029, + "step": 3965 + }, + { + "epoch": 10.585495675316034, + "grad_norm": 0.7444695234298706, + "learning_rate": 9.228591156416404e-05, + "loss": 0.0402, + "step": 3970 + }, + { + "epoch": 10.598802395209582, + "grad_norm": 13.542842864990234, + "learning_rate": 9.207500431432115e-05, + "loss": 0.1159, + "step": 3975 + }, + { + "epoch": 10.612109115103127, + "grad_norm": 0.2932852506637573, + "learning_rate": 9.186413253321418e-05, + "loss": 0.0927, + "step": 3980 + }, + { + "epoch": 10.625415834996673, + "grad_norm": 1.3260793685913086, + "learning_rate": 9.165329716461087e-05, + "loss": 0.0972, + "step": 3985 + }, + { + "epoch": 10.638722554890219, + "grad_norm": 0.11311087757349014, + "learning_rate": 9.144249915211605e-05, + "loss": 0.0425, + "step": 3990 + }, + { + "epoch": 10.652029274783766, + "grad_norm": 0.4719153642654419, + "learning_rate": 9.123173943916728e-05, + "loss": 0.0714, + "step": 3995 + }, + { + "epoch": 10.665335994677312, + "grad_norm": 19.49868392944336, + "learning_rate": 9.102101896903084e-05, + "loss": 0.0478, + "step": 4000 + }, + { + "epoch": 10.678642714570858, + "grad_norm": 3.4599437713623047, + "learning_rate": 9.081033868479727e-05, + "loss": 0.0556, + "step": 4005 + }, + { + "epoch": 10.691949434464405, + "grad_norm": 4.2978620529174805, + "learning_rate": 9.059969952937727e-05, + "loss": 0.1521, + "step": 4010 + }, + { + "epoch": 10.70525615435795, + "grad_norm": 0.7445971965789795, + "learning_rate": 9.038910244549747e-05, + "loss": 0.0861, + "step": 4015 + }, + { + "epoch": 10.718562874251496, + "grad_norm": 0.7097308039665222, + "learning_rate": 9.01785483756963e-05, + "loss": 0.0422, + "step": 4020 + }, + { + "epoch": 10.731869594145044, + "grad_norm": 0.5130826830863953, + "learning_rate": 8.99680382623195e-05, + "loss": 0.1419, + "step": 4025 + }, + { + "epoch": 10.74517631403859, + "grad_norm": 0.006258231122046709, + "learning_rate": 8.975757304751626e-05, + "loss": 0.0277, + "step": 4030 + }, + { + "epoch": 10.758483033932135, + "grad_norm": 1.6164250373840332, + "learning_rate": 8.954715367323468e-05, + "loss": 0.1055, + "step": 4035 + }, + { + "epoch": 10.771789753825683, + "grad_norm": 0.5906782746315002, + "learning_rate": 8.933678108121774e-05, + "loss": 0.0427, + "step": 4040 + }, + { + "epoch": 10.785096473719229, + "grad_norm": 1.2181246280670166, + "learning_rate": 8.912645621299918e-05, + "loss": 0.0399, + "step": 4045 + }, + { + "epoch": 10.798403193612774, + "grad_norm": 0.004639142192900181, + "learning_rate": 8.891618000989891e-05, + "loss": 0.234, + "step": 4050 + }, + { + "epoch": 10.81170991350632, + "grad_norm": 0.5142450332641602, + "learning_rate": 8.870595341301921e-05, + "loss": 0.092, + "step": 4055 + }, + { + "epoch": 10.825016633399867, + "grad_norm": 0.17664307355880737, + "learning_rate": 8.849577736324033e-05, + "loss": 0.1276, + "step": 4060 + }, + { + "epoch": 10.838323353293413, + "grad_norm": 0.006000434514135122, + "learning_rate": 8.828565280121617e-05, + "loss": 0.0729, + "step": 4065 + }, + { + "epoch": 10.851630073186959, + "grad_norm": 0.4471026062965393, + "learning_rate": 8.807558066737041e-05, + "loss": 0.0274, + "step": 4070 + }, + { + "epoch": 10.864936793080506, + "grad_norm": 0.03893847018480301, + "learning_rate": 8.786556190189182e-05, + "loss": 0.108, + "step": 4075 + }, + { + "epoch": 10.878243512974052, + "grad_norm": 0.004344575107097626, + "learning_rate": 8.765559744473053e-05, + "loss": 0.0387, + "step": 4080 + }, + { + "epoch": 10.891550232867598, + "grad_norm": 1.100684404373169, + "learning_rate": 8.744568823559356e-05, + "loss": 0.0963, + "step": 4085 + }, + { + "epoch": 10.904856952761145, + "grad_norm": 2.4654958248138428, + "learning_rate": 8.723583521394054e-05, + "loss": 0.0638, + "step": 4090 + }, + { + "epoch": 10.918163672654691, + "grad_norm": 0.667192280292511, + "learning_rate": 8.702603931897982e-05, + "loss": 0.0457, + "step": 4095 + }, + { + "epoch": 10.931470392548237, + "grad_norm": 0.021668490022420883, + "learning_rate": 8.681630148966398e-05, + "loss": 0.001, + "step": 4100 + }, + { + "epoch": 10.944777112441782, + "grad_norm": 0.06268395483493805, + "learning_rate": 8.660662266468569e-05, + "loss": 0.0323, + "step": 4105 + }, + { + "epoch": 10.95808383233533, + "grad_norm": 0.6293454766273499, + "learning_rate": 8.639700378247361e-05, + "loss": 0.0573, + "step": 4110 + }, + { + "epoch": 10.971390552228875, + "grad_norm": 0.6691858768463135, + "learning_rate": 8.618744578118805e-05, + "loss": 0.0407, + "step": 4115 + }, + { + "epoch": 10.984697272122421, + "grad_norm": 0.014253449626266956, + "learning_rate": 8.597794959871694e-05, + "loss": 0.0089, + "step": 4120 + }, + { + "epoch": 10.998003992015969, + "grad_norm": 0.2566983997821808, + "learning_rate": 8.57685161726715e-05, + "loss": 0.0748, + "step": 4125 + }, + { + "epoch": 10.998003992015969, + "eval_loss": 1.299912929534912, + "eval_macro_f1": 67.6266109032924, + "eval_macro_precision": 70.40109716496542, + "eval_macro_recall": 66.54435860863018, + "eval_micro_f1": 87.3502994011976, + "eval_micro_precision": 87.3502994011976, + "eval_micro_recall": 87.3502994011976, + "eval_runtime": 4.9792, + "eval_samples_per_second": 536.634, + "eval_steps_per_second": 33.54, + "step": 4125 + }, + { + "epoch": 11.013306719893546, + "grad_norm": 0.03808118775486946, + "learning_rate": 8.555914644038202e-05, + "loss": 0.0191, + "step": 4130 + }, + { + "epoch": 11.026613439787093, + "grad_norm": 0.5897039771080017, + "learning_rate": 8.534984133889376e-05, + "loss": 0.0351, + "step": 4135 + }, + { + "epoch": 11.039920159680639, + "grad_norm": 0.0037707712035626173, + "learning_rate": 8.514060180496285e-05, + "loss": 0.0019, + "step": 4140 + }, + { + "epoch": 11.053226879574185, + "grad_norm": 0.00108764145988971, + "learning_rate": 8.49314287750517e-05, + "loss": 0.0073, + "step": 4145 + }, + { + "epoch": 11.066533599467732, + "grad_norm": 0.5410427451133728, + "learning_rate": 8.47223231853253e-05, + "loss": 0.0322, + "step": 4150 + }, + { + "epoch": 11.079840319361278, + "grad_norm": 0.6569406390190125, + "learning_rate": 8.451328597164679e-05, + "loss": 0.045, + "step": 4155 + }, + { + "epoch": 11.093147039254823, + "grad_norm": 0.0010824741329997778, + "learning_rate": 8.43043180695731e-05, + "loss": 0.0765, + "step": 4160 + }, + { + "epoch": 11.10645375914837, + "grad_norm": 0.2378658503293991, + "learning_rate": 8.409542041435128e-05, + "loss": 0.0524, + "step": 4165 + }, + { + "epoch": 11.119760479041917, + "grad_norm": 0.3880664110183716, + "learning_rate": 8.38865939409136e-05, + "loss": 0.0621, + "step": 4170 + }, + { + "epoch": 11.133067198935462, + "grad_norm": 0.7984123826026917, + "learning_rate": 8.367783958387407e-05, + "loss": 0.0756, + "step": 4175 + }, + { + "epoch": 11.146373918829008, + "grad_norm": 0.3829672634601593, + "learning_rate": 8.346915827752382e-05, + "loss": 0.0669, + "step": 4180 + }, + { + "epoch": 11.159680638722556, + "grad_norm": 0.002959012985229492, + "learning_rate": 8.326055095582694e-05, + "loss": 0.0262, + "step": 4185 + }, + { + "epoch": 11.172987358616101, + "grad_norm": 0.45035097002983093, + "learning_rate": 8.305201855241659e-05, + "loss": 0.0263, + "step": 4190 + }, + { + "epoch": 11.186294078509647, + "grad_norm": 0.002182081574574113, + "learning_rate": 8.284356200059051e-05, + "loss": 0.044, + "step": 4195 + }, + { + "epoch": 11.199600798403194, + "grad_norm": 0.0009778572712093592, + "learning_rate": 8.263518223330697e-05, + "loss": 0.0278, + "step": 4200 + }, + { + "epoch": 11.21290751829674, + "grad_norm": 0.0005438898224383593, + "learning_rate": 8.242688018318063e-05, + "loss": 0.0462, + "step": 4205 + }, + { + "epoch": 11.226214238190286, + "grad_norm": 0.0006932623218744993, + "learning_rate": 8.221865678247821e-05, + "loss": 0.0438, + "step": 4210 + }, + { + "epoch": 11.239520958083832, + "grad_norm": 0.5347298383712769, + "learning_rate": 8.201051296311462e-05, + "loss": 0.0421, + "step": 4215 + }, + { + "epoch": 11.252827677977379, + "grad_norm": 0.5404713749885559, + "learning_rate": 8.180244965664845e-05, + "loss": 0.0566, + "step": 4220 + }, + { + "epoch": 11.266134397870925, + "grad_norm": 0.9088073968887329, + "learning_rate": 8.159446779427797e-05, + "loss": 0.0362, + "step": 4225 + }, + { + "epoch": 11.27944111776447, + "grad_norm": 0.4720342457294464, + "learning_rate": 8.1386568306837e-05, + "loss": 0.1158, + "step": 4230 + }, + { + "epoch": 11.292747837658018, + "grad_norm": 47.885772705078125, + "learning_rate": 8.117875212479069e-05, + "loss": 0.0529, + "step": 4235 + }, + { + "epoch": 11.306054557551564, + "grad_norm": 0.6843175888061523, + "learning_rate": 8.09710201782312e-05, + "loss": 0.0287, + "step": 4240 + }, + { + "epoch": 11.31936127744511, + "grad_norm": 1.8459924459457397, + "learning_rate": 8.076337339687394e-05, + "loss": 0.0635, + "step": 4245 + }, + { + "epoch": 11.332667997338657, + "grad_norm": 3.7261202335357666, + "learning_rate": 8.055581271005292e-05, + "loss": 0.0271, + "step": 4250 + }, + { + "epoch": 11.345974717232203, + "grad_norm": 0.002110959030687809, + "learning_rate": 8.034833904671698e-05, + "loss": 0.0213, + "step": 4255 + }, + { + "epoch": 11.359281437125748, + "grad_norm": 0.783439040184021, + "learning_rate": 8.014095333542548e-05, + "loss": 0.157, + "step": 4260 + }, + { + "epoch": 11.372588157019294, + "grad_norm": 0.3574387729167938, + "learning_rate": 7.9933656504344e-05, + "loss": 0.0669, + "step": 4265 + }, + { + "epoch": 11.385894876912841, + "grad_norm": 0.5715398788452148, + "learning_rate": 7.97264494812405e-05, + "loss": 0.0349, + "step": 4270 + }, + { + "epoch": 11.399201596806387, + "grad_norm": 0.6373995542526245, + "learning_rate": 7.951933319348095e-05, + "loss": 0.0257, + "step": 4275 + }, + { + "epoch": 11.412508316699933, + "grad_norm": 2.8598508834838867, + "learning_rate": 7.931230856802514e-05, + "loss": 0.1056, + "step": 4280 + }, + { + "epoch": 11.42581503659348, + "grad_norm": 0.3895857334136963, + "learning_rate": 7.91053765314228e-05, + "loss": 0.047, + "step": 4285 + }, + { + "epoch": 11.439121756487026, + "grad_norm": 0.00783681869506836, + "learning_rate": 7.889853800980904e-05, + "loss": 0.0344, + "step": 4290 + }, + { + "epoch": 11.452428476380572, + "grad_norm": 0.6813995838165283, + "learning_rate": 7.869179392890065e-05, + "loss": 0.0847, + "step": 4295 + }, + { + "epoch": 11.46573519627412, + "grad_norm": 0.03324246034026146, + "learning_rate": 7.848514521399166e-05, + "loss": 0.0171, + "step": 4300 + }, + { + "epoch": 11.479041916167665, + "grad_norm": 0.012428533285856247, + "learning_rate": 7.827859278994925e-05, + "loss": 0.0697, + "step": 4305 + }, + { + "epoch": 11.49234863606121, + "grad_norm": 0.02753024362027645, + "learning_rate": 7.807213758120966e-05, + "loss": 0.037, + "step": 4310 + }, + { + "epoch": 11.505655355954758, + "grad_norm": 0.28044065833091736, + "learning_rate": 7.786578051177415e-05, + "loss": 0.0139, + "step": 4315 + }, + { + "epoch": 11.518962075848304, + "grad_norm": 0.004422112833708525, + "learning_rate": 7.765952250520459e-05, + "loss": 0.0564, + "step": 4320 + }, + { + "epoch": 11.53226879574185, + "grad_norm": 0.5593070387840271, + "learning_rate": 7.745336448461959e-05, + "loss": 0.0538, + "step": 4325 + }, + { + "epoch": 11.545575515635395, + "grad_norm": 1.2178239822387695, + "learning_rate": 7.72473073726902e-05, + "loss": 0.0725, + "step": 4330 + }, + { + "epoch": 11.558882235528943, + "grad_norm": 0.47392886877059937, + "learning_rate": 7.704135209163589e-05, + "loss": 0.0196, + "step": 4335 + }, + { + "epoch": 11.572188955422488, + "grad_norm": 0.4012060761451721, + "learning_rate": 7.683549956322043e-05, + "loss": 0.0067, + "step": 4340 + }, + { + "epoch": 11.585495675316034, + "grad_norm": 0.3064373731613159, + "learning_rate": 7.662975070874761e-05, + "loss": 0.0347, + "step": 4345 + }, + { + "epoch": 11.598802395209582, + "grad_norm": 0.0003016276750713587, + "learning_rate": 7.642410644905726e-05, + "loss": 0.0082, + "step": 4350 + }, + { + "epoch": 11.612109115103127, + "grad_norm": 0.6510899662971497, + "learning_rate": 7.62185677045212e-05, + "loss": 0.0775, + "step": 4355 + }, + { + "epoch": 11.625415834996673, + "grad_norm": 0.44786661863327026, + "learning_rate": 7.601313539503879e-05, + "loss": 0.0881, + "step": 4360 + }, + { + "epoch": 11.638722554890219, + "grad_norm": 0.16136354207992554, + "learning_rate": 7.580781044003324e-05, + "loss": 0.0321, + "step": 4365 + }, + { + "epoch": 11.652029274783766, + "grad_norm": 0.0006294685881584883, + "learning_rate": 7.560259375844718e-05, + "loss": 0.1577, + "step": 4370 + }, + { + "epoch": 11.665335994677312, + "grad_norm": 0.0005464013665914536, + "learning_rate": 7.539748626873866e-05, + "loss": 0.0069, + "step": 4375 + }, + { + "epoch": 11.678642714570858, + "grad_norm": 0.7603722810745239, + "learning_rate": 7.519248888887716e-05, + "loss": 0.0595, + "step": 4380 + }, + { + "epoch": 11.691949434464405, + "grad_norm": 43.034149169921875, + "learning_rate": 7.498760253633909e-05, + "loss": 0.1499, + "step": 4385 + }, + { + "epoch": 11.70525615435795, + "grad_norm": 0.6510688066482544, + "learning_rate": 7.478282812810428e-05, + "loss": 0.0514, + "step": 4390 + }, + { + "epoch": 11.718562874251496, + "grad_norm": 0.516620397567749, + "learning_rate": 7.457816658065134e-05, + "loss": 0.047, + "step": 4395 + }, + { + "epoch": 11.731869594145044, + "grad_norm": 0.039791446179151535, + "learning_rate": 7.437361880995375e-05, + "loss": 0.0415, + "step": 4400 + }, + { + "epoch": 11.74517631403859, + "grad_norm": 0.0013697980903089046, + "learning_rate": 7.416918573147588e-05, + "loss": 0.0077, + "step": 4405 + }, + { + "epoch": 11.758483033932135, + "grad_norm": 0.24411720037460327, + "learning_rate": 7.39648682601688e-05, + "loss": 0.0435, + "step": 4410 + }, + { + "epoch": 11.771789753825683, + "grad_norm": 0.012387442402541637, + "learning_rate": 7.376066731046605e-05, + "loss": 0.0466, + "step": 4415 + }, + { + "epoch": 11.785096473719229, + "grad_norm": 0.5403460264205933, + "learning_rate": 7.35565837962798e-05, + "loss": 0.112, + "step": 4420 + }, + { + "epoch": 11.798403193612774, + "grad_norm": 0.0008433172479271889, + "learning_rate": 7.335261863099651e-05, + "loss": 0.0437, + "step": 4425 + }, + { + "epoch": 11.81170991350632, + "grad_norm": 0.009126711636781693, + "learning_rate": 7.314877272747306e-05, + "loss": 0.0739, + "step": 4430 + }, + { + "epoch": 11.825016633399867, + "grad_norm": 0.0014366311952471733, + "learning_rate": 7.294504699803257e-05, + "loss": 0.0525, + "step": 4435 + }, + { + "epoch": 11.838323353293413, + "grad_norm": 0.8768602609634399, + "learning_rate": 7.274144235446023e-05, + "loss": 0.0247, + "step": 4440 + }, + { + "epoch": 11.851630073186959, + "grad_norm": 0.8969868421554565, + "learning_rate": 7.253795970799935e-05, + "loss": 0.0722, + "step": 4445 + }, + { + "epoch": 11.864936793080506, + "grad_norm": 3.381582498550415, + "learning_rate": 7.23345999693473e-05, + "loss": 0.0392, + "step": 4450 + }, + { + "epoch": 11.878243512974052, + "grad_norm": 0.00464739790186286, + "learning_rate": 7.213136404865124e-05, + "loss": 0.0479, + "step": 4455 + }, + { + "epoch": 11.891550232867598, + "grad_norm": 0.4277834892272949, + "learning_rate": 7.192825285550431e-05, + "loss": 0.0691, + "step": 4460 + }, + { + "epoch": 11.904856952761145, + "grad_norm": 1.2095205783843994, + "learning_rate": 7.172526729894129e-05, + "loss": 0.0572, + "step": 4465 + }, + { + "epoch": 11.918163672654691, + "grad_norm": 0.0006847070762887597, + "learning_rate": 7.152240828743477e-05, + "loss": 0.0374, + "step": 4470 + }, + { + "epoch": 11.931470392548237, + "grad_norm": 0.4241344630718231, + "learning_rate": 7.131967672889101e-05, + "loss": 0.0469, + "step": 4475 + }, + { + "epoch": 11.944777112441782, + "grad_norm": 0.0002920299884863198, + "learning_rate": 7.111707353064565e-05, + "loss": 0.0809, + "step": 4480 + }, + { + "epoch": 11.95808383233533, + "grad_norm": 0.0005238928133621812, + "learning_rate": 7.09145995994601e-05, + "loss": 0.0426, + "step": 4485 + }, + { + "epoch": 11.971390552228875, + "grad_norm": 0.0008192436653189361, + "learning_rate": 7.071225584151708e-05, + "loss": 0.037, + "step": 4490 + }, + { + "epoch": 11.984697272122421, + "grad_norm": 0.0003719358937814832, + "learning_rate": 7.051004316241672e-05, + "loss": 0.0411, + "step": 4495 + }, + { + "epoch": 11.998003992015969, + "grad_norm": 0.4025455713272095, + "learning_rate": 7.030796246717255e-05, + "loss": 0.0644, + "step": 4500 + }, + { + "epoch": 11.998003992015969, + "eval_loss": 1.445887804031372, + "eval_macro_f1": 67.62150339419195, + "eval_macro_precision": 71.442956060202, + "eval_macro_recall": 65.1796170254859, + "eval_micro_f1": 87.5374251497006, + "eval_micro_precision": 87.5374251497006, + "eval_micro_recall": 87.5374251497006, + "eval_runtime": 6.051, + "eval_samples_per_second": 441.581, + "eval_steps_per_second": 27.599, + "step": 4500 + }, + { + "epoch": 12.013306719893546, + "grad_norm": 0.4702771008014679, + "learning_rate": 7.010601466020732e-05, + "loss": 0.0361, + "step": 4505 + }, + { + "epoch": 12.026613439787093, + "grad_norm": 0.32089468836784363, + "learning_rate": 6.990420064534914e-05, + "loss": 0.0383, + "step": 4510 + }, + { + "epoch": 12.039920159680639, + "grad_norm": 0.1299007534980774, + "learning_rate": 6.970252132582728e-05, + "loss": 0.061, + "step": 4515 + }, + { + "epoch": 12.053226879574185, + "grad_norm": 0.5304166078567505, + "learning_rate": 6.950097760426814e-05, + "loss": 0.0288, + "step": 4520 + }, + { + "epoch": 12.066533599467732, + "grad_norm": 0.3846903443336487, + "learning_rate": 6.929957038269122e-05, + "loss": 0.0377, + "step": 4525 + }, + { + "epoch": 12.079840319361278, + "grad_norm": 0.001077175373211503, + "learning_rate": 6.909830056250527e-05, + "loss": 0.1997, + "step": 4530 + }, + { + "epoch": 12.093147039254823, + "grad_norm": 0.31744185090065, + "learning_rate": 6.889716904450387e-05, + "loss": 0.0564, + "step": 4535 + }, + { + "epoch": 12.10645375914837, + "grad_norm": 1.2166818380355835, + "learning_rate": 6.869617672886182e-05, + "loss": 0.0387, + "step": 4540 + }, + { + "epoch": 12.119760479041917, + "grad_norm": 0.4173411428928375, + "learning_rate": 6.849532451513074e-05, + "loss": 0.0349, + "step": 4545 + }, + { + "epoch": 12.133067198935462, + "grad_norm": 1.1611005067825317, + "learning_rate": 6.829461330223533e-05, + "loss": 0.0532, + "step": 4550 + }, + { + "epoch": 12.146373918829008, + "grad_norm": 0.9703657627105713, + "learning_rate": 6.809404398846921e-05, + "loss": 0.0657, + "step": 4555 + }, + { + "epoch": 12.159680638722556, + "grad_norm": 0.5941152572631836, + "learning_rate": 6.789361747149093e-05, + "loss": 0.0221, + "step": 4560 + }, + { + "epoch": 12.172987358616101, + "grad_norm": 0.5168139934539795, + "learning_rate": 6.769333464831991e-05, + "loss": 0.0487, + "step": 4565 + }, + { + "epoch": 12.186294078509647, + "grad_norm": 0.01413114182651043, + "learning_rate": 6.74931964153325e-05, + "loss": 0.0097, + "step": 4570 + }, + { + "epoch": 12.199600798403194, + "grad_norm": 0.00606010714545846, + "learning_rate": 6.729320366825784e-05, + "loss": 0.0238, + "step": 4575 + }, + { + "epoch": 12.21290751829674, + "grad_norm": 0.9490993022918701, + "learning_rate": 6.709335730217412e-05, + "loss": 0.0697, + "step": 4580 + }, + { + "epoch": 12.226214238190286, + "grad_norm": 0.00019976045587100089, + "learning_rate": 6.68936582115042e-05, + "loss": 0.0, + "step": 4585 + }, + { + "epoch": 12.239520958083832, + "grad_norm": 0.4100409150123596, + "learning_rate": 6.669410729001193e-05, + "loss": 0.0244, + "step": 4590 + }, + { + "epoch": 12.252827677977379, + "grad_norm": 0.21931087970733643, + "learning_rate": 6.649470543079798e-05, + "loss": 0.0459, + "step": 4595 + }, + { + "epoch": 12.266134397870925, + "grad_norm": 0.39032813906669617, + "learning_rate": 6.629545352629582e-05, + "loss": 0.028, + "step": 4600 + }, + { + "epoch": 12.27944111776447, + "grad_norm": 0.2530187964439392, + "learning_rate": 6.609635246826794e-05, + "loss": 0.011, + "step": 4605 + }, + { + "epoch": 12.292747837658018, + "grad_norm": 0.000762270821724087, + "learning_rate": 6.589740314780157e-05, + "loss": 0.0347, + "step": 4610 + }, + { + "epoch": 12.306054557551564, + "grad_norm": 0.05708887055516243, + "learning_rate": 6.569860645530487e-05, + "loss": 0.0406, + "step": 4615 + }, + { + "epoch": 12.31936127744511, + "grad_norm": 0.00042223682976327837, + "learning_rate": 6.549996328050296e-05, + "loss": 0.0387, + "step": 4620 + }, + { + "epoch": 12.332667997338657, + "grad_norm": 0.4647948741912842, + "learning_rate": 6.530147451243377e-05, + "loss": 0.0285, + "step": 4625 + }, + { + "epoch": 12.345974717232203, + "grad_norm": 0.94704669713974, + "learning_rate": 6.51031410394443e-05, + "loss": 0.0515, + "step": 4630 + }, + { + "epoch": 12.359281437125748, + "grad_norm": 0.0007556902128271759, + "learning_rate": 6.490496374918647e-05, + "loss": 0.0334, + "step": 4635 + }, + { + "epoch": 12.372588157019294, + "grad_norm": 0.5485817193984985, + "learning_rate": 6.470694352861312e-05, + "loss": 0.0679, + "step": 4640 + }, + { + "epoch": 12.385894876912841, + "grad_norm": 0.6135960817337036, + "learning_rate": 6.450908126397423e-05, + "loss": 0.0247, + "step": 4645 + }, + { + "epoch": 12.399201596806387, + "grad_norm": 0.5665143728256226, + "learning_rate": 6.431137784081282e-05, + "loss": 0.0321, + "step": 4650 + }, + { + "epoch": 12.412508316699933, + "grad_norm": 0.0015934518305584788, + "learning_rate": 6.411383414396095e-05, + "loss": 0.0294, + "step": 4655 + }, + { + "epoch": 12.42581503659348, + "grad_norm": 0.3874810039997101, + "learning_rate": 6.391645105753583e-05, + "loss": 0.0295, + "step": 4660 + }, + { + "epoch": 12.439121756487026, + "grad_norm": 0.000483459240058437, + "learning_rate": 6.371922946493591e-05, + "loss": 0.0359, + "step": 4665 + }, + { + "epoch": 12.452428476380572, + "grad_norm": 0.520750105381012, + "learning_rate": 6.352217024883678e-05, + "loss": 0.0393, + "step": 4670 + }, + { + "epoch": 12.46573519627412, + "grad_norm": 0.5864303112030029, + "learning_rate": 6.33252742911874e-05, + "loss": 0.003, + "step": 4675 + }, + { + "epoch": 12.479041916167665, + "grad_norm": 1.0039427280426025, + "learning_rate": 6.312854247320595e-05, + "loss": 0.1379, + "step": 4680 + }, + { + "epoch": 12.49234863606121, + "grad_norm": 0.006187811028212309, + "learning_rate": 6.293197567537605e-05, + "loss": 0.024, + "step": 4685 + }, + { + "epoch": 12.505655355954758, + "grad_norm": 0.615149974822998, + "learning_rate": 6.273557477744285e-05, + "loss": 0.0604, + "step": 4690 + }, + { + "epoch": 12.518962075848304, + "grad_norm": 0.006822746712714434, + "learning_rate": 6.25393406584088e-05, + "loss": 0.0376, + "step": 4695 + }, + { + "epoch": 12.53226879574185, + "grad_norm": 0.038054272532463074, + "learning_rate": 6.234327419653013e-05, + "loss": 0.0287, + "step": 4700 + }, + { + "epoch": 12.545575515635395, + "grad_norm": 0.0010650916956365108, + "learning_rate": 6.214737626931261e-05, + "loss": 0.0987, + "step": 4705 + }, + { + "epoch": 12.558882235528943, + "grad_norm": 1.0322421789169312, + "learning_rate": 6.19516477535077e-05, + "loss": 0.055, + "step": 4710 + }, + { + "epoch": 12.572188955422488, + "grad_norm": 0.006024849601089954, + "learning_rate": 6.175608952510874e-05, + "loss": 0.0016, + "step": 4715 + }, + { + "epoch": 12.585495675316034, + "grad_norm": 0.0465017631649971, + "learning_rate": 6.156070245934684e-05, + "loss": 0.0503, + "step": 4720 + }, + { + "epoch": 12.598802395209582, + "grad_norm": 0.2706870436668396, + "learning_rate": 6.136548743068713e-05, + "loss": 0.0268, + "step": 4725 + }, + { + "epoch": 12.612109115103127, + "grad_norm": 0.0009190389537252486, + "learning_rate": 6.117044531282481e-05, + "loss": 0.0928, + "step": 4730 + }, + { + "epoch": 12.625415834996673, + "grad_norm": 0.6977292895317078, + "learning_rate": 6.097557697868108e-05, + "loss": 0.0543, + "step": 4735 + }, + { + "epoch": 12.638722554890219, + "grad_norm": 1.7532901763916016, + "learning_rate": 6.078088330039945e-05, + "loss": 0.0318, + "step": 4740 + }, + { + "epoch": 12.652029274783766, + "grad_norm": 0.0035180707927793264, + "learning_rate": 6.0586365149341806e-05, + "loss": 0.0384, + "step": 4745 + }, + { + "epoch": 12.665335994677312, + "grad_norm": 0.2537969648838043, + "learning_rate": 6.039202339608432e-05, + "loss": 0.0376, + "step": 4750 + }, + { + "epoch": 12.678642714570858, + "grad_norm": 0.0006269970326684415, + "learning_rate": 6.019785891041381e-05, + "loss": 0.0568, + "step": 4755 + }, + { + "epoch": 12.691949434464405, + "grad_norm": 0.91341233253479, + "learning_rate": 6.0003872561323584e-05, + "loss": 0.0295, + "step": 4760 + }, + { + "epoch": 12.70525615435795, + "grad_norm": 0.3654066324234009, + "learning_rate": 5.9810065217009824e-05, + "loss": 0.0188, + "step": 4765 + }, + { + "epoch": 12.718562874251496, + "grad_norm": 0.0023132911883294582, + "learning_rate": 5.9616437744867535e-05, + "loss": 0.0302, + "step": 4770 + }, + { + "epoch": 12.731869594145044, + "grad_norm": 0.3889198303222656, + "learning_rate": 5.9422991011486626e-05, + "loss": 0.0503, + "step": 4775 + }, + { + "epoch": 12.74517631403859, + "grad_norm": 0.0006781243137083948, + "learning_rate": 5.922972588264818e-05, + "loss": 0.0005, + "step": 4780 + }, + { + "epoch": 12.758483033932135, + "grad_norm": 0.0015274898614734411, + "learning_rate": 5.9036643223320475e-05, + "loss": 0.029, + "step": 4785 + }, + { + "epoch": 12.771789753825683, + "grad_norm": 0.000553864287212491, + "learning_rate": 5.8843743897655077e-05, + "loss": 0.0619, + "step": 4790 + }, + { + "epoch": 12.785096473719229, + "grad_norm": 0.12268496304750443, + "learning_rate": 5.865102876898315e-05, + "loss": 0.0525, + "step": 4795 + }, + { + "epoch": 12.798403193612774, + "grad_norm": 0.5278739929199219, + "learning_rate": 5.845849869981137e-05, + "loss": 0.0105, + "step": 4800 + }, + { + "epoch": 12.81170991350632, + "grad_norm": 0.3189643621444702, + "learning_rate": 5.8266154551818216e-05, + "loss": 0.051, + "step": 4805 + }, + { + "epoch": 12.825016633399867, + "grad_norm": 0.5433342456817627, + "learning_rate": 5.807399718585009e-05, + "loss": 0.042, + "step": 4810 + }, + { + "epoch": 12.838323353293413, + "grad_norm": 0.943836510181427, + "learning_rate": 5.788202746191734e-05, + "loss": 0.0255, + "step": 4815 + }, + { + "epoch": 12.851630073186959, + "grad_norm": 0.1590261310338974, + "learning_rate": 5.769024623919064e-05, + "loss": 0.0127, + "step": 4820 + }, + { + "epoch": 12.864936793080506, + "grad_norm": 0.45719876885414124, + "learning_rate": 5.749865437599703e-05, + "loss": 0.0495, + "step": 4825 + }, + { + "epoch": 12.878243512974052, + "grad_norm": 0.031068850308656693, + "learning_rate": 5.7307252729815833e-05, + "loss": 0.0578, + "step": 4830 + }, + { + "epoch": 12.891550232867598, + "grad_norm": 0.00038968288572505116, + "learning_rate": 5.711604215727532e-05, + "loss": 0.0487, + "step": 4835 + }, + { + "epoch": 12.904856952761145, + "grad_norm": 0.028543902561068535, + "learning_rate": 5.6925023514148414e-05, + "loss": 0.0064, + "step": 4840 + }, + { + "epoch": 12.918163672654691, + "grad_norm": 0.000108074709714856, + "learning_rate": 5.6734197655349156e-05, + "loss": 0.0306, + "step": 4845 + }, + { + "epoch": 12.931470392548237, + "grad_norm": 0.0013804432237520814, + "learning_rate": 5.654356543492882e-05, + "loss": 0.0457, + "step": 4850 + }, + { + "epoch": 12.944777112441782, + "grad_norm": 0.00011231685493839905, + "learning_rate": 5.635312770607179e-05, + "loss": 0.0294, + "step": 4855 + }, + { + "epoch": 12.95808383233533, + "grad_norm": 0.45072370767593384, + "learning_rate": 5.616288532109225e-05, + "loss": 0.063, + "step": 4860 + }, + { + "epoch": 12.971390552228875, + "grad_norm": 0.6160675287246704, + "learning_rate": 5.597283913143006e-05, + "loss": 0.0382, + "step": 4865 + }, + { + "epoch": 12.984697272122421, + "grad_norm": 1.4277915954589844, + "learning_rate": 5.5782989987646896e-05, + "loss": 0.0962, + "step": 4870 + }, + { + "epoch": 12.998003992015969, + "grad_norm": 0.00010680627747206017, + "learning_rate": 5.559333873942259e-05, + "loss": 0.0201, + "step": 4875 + }, + { + "epoch": 12.998003992015969, + "eval_loss": 1.546578049659729, + "eval_macro_f1": 67.89414685540753, + "eval_macro_precision": 71.53678356588046, + "eval_macro_recall": 65.58376331013532, + "eval_micro_f1": 87.6497005988024, + "eval_micro_precision": 87.6497005988024, + "eval_micro_recall": 87.6497005988024, + "eval_runtime": 5.2905, + "eval_samples_per_second": 505.058, + "eval_steps_per_second": 31.566, + "step": 4875 + }, + { + "epoch": 13.013306719893546, + "grad_norm": 0.6065590381622314, + "learning_rate": 5.5403886235551374e-05, + "loss": 0.0549, + "step": 4880 + }, + { + "epoch": 13.026613439787093, + "grad_norm": 0.001139428117312491, + "learning_rate": 5.521463332393784e-05, + "loss": 0.0363, + "step": 4885 + }, + { + "epoch": 13.039920159680639, + "grad_norm": 4.570841701934114e-05, + "learning_rate": 5.5025580851593436e-05, + "loss": 0.0332, + "step": 4890 + }, + { + "epoch": 13.053226879574185, + "grad_norm": 0.013228015042841434, + "learning_rate": 5.483672966463245e-05, + "loss": 0.004, + "step": 4895 + }, + { + "epoch": 13.066533599467732, + "grad_norm": 0.8052898049354553, + "learning_rate": 5.4648080608268245e-05, + "loss": 0.0337, + "step": 4900 + }, + { + "epoch": 13.079840319361278, + "grad_norm": 0.652985692024231, + "learning_rate": 5.445963452680973e-05, + "loss": 0.0274, + "step": 4905 + }, + { + "epoch": 13.093147039254823, + "grad_norm": 0.8381507992744446, + "learning_rate": 5.427139226365718e-05, + "loss": 0.0304, + "step": 4910 + }, + { + "epoch": 13.10645375914837, + "grad_norm": 0.9545891284942627, + "learning_rate": 5.4083354661298814e-05, + "loss": 0.0852, + "step": 4915 + }, + { + "epoch": 13.119760479041917, + "grad_norm": 0.7544320821762085, + "learning_rate": 5.38955225613069e-05, + "loss": 0.0799, + "step": 4920 + }, + { + "epoch": 13.133067198935462, + "grad_norm": 0.5602716207504272, + "learning_rate": 5.3707896804333756e-05, + "loss": 0.0518, + "step": 4925 + }, + { + "epoch": 13.146373918829008, + "grad_norm": 0.823800265789032, + "learning_rate": 5.3520478230108464e-05, + "loss": 0.0438, + "step": 4930 + }, + { + "epoch": 13.159680638722556, + "grad_norm": 0.0064308131113648415, + "learning_rate": 5.333326767743263e-05, + "loss": 0.0392, + "step": 4935 + }, + { + "epoch": 13.172987358616101, + "grad_norm": 1.1453149318695068, + "learning_rate": 5.314626598417707e-05, + "loss": 0.0568, + "step": 4940 + }, + { + "epoch": 13.186294078509647, + "grad_norm": 0.00013595831114798784, + "learning_rate": 5.295947398727763e-05, + "loss": 0.0475, + "step": 4945 + }, + { + "epoch": 13.199600798403194, + "grad_norm": 0.4676132798194885, + "learning_rate": 5.277289252273174e-05, + "loss": 0.0579, + "step": 4950 + }, + { + "epoch": 13.21290751829674, + "grad_norm": 0.41389578580856323, + "learning_rate": 5.258652242559461e-05, + "loss": 0.0186, + "step": 4955 + }, + { + "epoch": 13.226214238190286, + "grad_norm": 0.5707477927207947, + "learning_rate": 5.2400364529975446e-05, + "loss": 0.0515, + "step": 4960 + }, + { + "epoch": 13.239520958083832, + "grad_norm": 5.477669037645683e-05, + "learning_rate": 5.221441966903371e-05, + "loss": 0.0461, + "step": 4965 + }, + { + "epoch": 13.252827677977379, + "grad_norm": 0.34287309646606445, + "learning_rate": 5.2028688674975415e-05, + "loss": 0.0395, + "step": 4970 + }, + { + "epoch": 13.266134397870925, + "grad_norm": 6.0640715673798695e-05, + "learning_rate": 5.184317237904939e-05, + "loss": 0.0, + "step": 4975 + }, + { + "epoch": 13.27944111776447, + "grad_norm": 0.3991676867008209, + "learning_rate": 5.1657871611543605e-05, + "loss": 0.009, + "step": 4980 + }, + { + "epoch": 13.292747837658018, + "grad_norm": 5.605097976513207e-05, + "learning_rate": 5.147278720178148e-05, + "loss": 0.0523, + "step": 4985 + }, + { + "epoch": 13.306054557551564, + "grad_norm": 0.42107030749320984, + "learning_rate": 5.128791997811799e-05, + "loss": 0.0158, + "step": 4990 + }, + { + "epoch": 13.31936127744511, + "grad_norm": 0.00029585917945951223, + "learning_rate": 5.110327076793613e-05, + "loss": 0.0316, + "step": 4995 + }, + { + "epoch": 13.332667997338657, + "grad_norm": 0.6536248326301575, + "learning_rate": 5.091884039764321e-05, + "loss": 0.0242, + "step": 5000 + }, + { + "epoch": 13.345974717232203, + "grad_norm": 0.00014492319314740598, + "learning_rate": 5.0734629692667e-05, + "loss": 0.0297, + "step": 5005 + }, + { + "epoch": 13.359281437125748, + "grad_norm": 0.9105699062347412, + "learning_rate": 5.055063947745233e-05, + "loss": 0.0343, + "step": 5010 + }, + { + "epoch": 13.372588157019294, + "grad_norm": 0.00011185540643054992, + "learning_rate": 5.036687057545704e-05, + "loss": 0.0215, + "step": 5015 + }, + { + "epoch": 13.385894876912841, + "grad_norm": 0.3092271685600281, + "learning_rate": 5.01833238091485e-05, + "loss": 0.0789, + "step": 5020 + }, + { + "epoch": 13.399201596806387, + "grad_norm": 0.3386293053627014, + "learning_rate": 5.000000000000002e-05, + "loss": 0.0668, + "step": 5025 + }, + { + "epoch": 13.412508316699933, + "grad_norm": 0.001697537605650723, + "learning_rate": 4.98168999684869e-05, + "loss": 0.0337, + "step": 5030 + }, + { + "epoch": 13.42581503659348, + "grad_norm": 0.39653119444847107, + "learning_rate": 4.9634024534083044e-05, + "loss": 0.012, + "step": 5035 + }, + { + "epoch": 13.439121756487026, + "grad_norm": 0.19999262690544128, + "learning_rate": 4.945137451525707e-05, + "loss": 0.0134, + "step": 5040 + }, + { + "epoch": 13.452428476380572, + "grad_norm": 0.2290111631155014, + "learning_rate": 4.926895072946871e-05, + "loss": 0.02, + "step": 5045 + }, + { + "epoch": 13.46573519627412, + "grad_norm": 0.39010921120643616, + "learning_rate": 4.908675399316534e-05, + "loss": 0.0259, + "step": 5050 + }, + { + "epoch": 13.479041916167665, + "grad_norm": 1.3865301609039307, + "learning_rate": 4.890478512177795e-05, + "loss": 0.0423, + "step": 5055 + }, + { + "epoch": 13.49234863606121, + "grad_norm": 0.7625669240951538, + "learning_rate": 4.8723044929717906e-05, + "loss": 0.0528, + "step": 5060 + }, + { + "epoch": 13.505655355954758, + "grad_norm": 0.45569607615470886, + "learning_rate": 4.8541534230372974e-05, + "loss": 0.0291, + "step": 5065 + }, + { + "epoch": 13.518962075848304, + "grad_norm": 8.144270395860076e-05, + "learning_rate": 4.836025383610382e-05, + "loss": 0.0211, + "step": 5070 + }, + { + "epoch": 13.53226879574185, + "grad_norm": 0.4941138029098511, + "learning_rate": 4.8179204558240444e-05, + "loss": 0.076, + "step": 5075 + }, + { + "epoch": 13.545575515635395, + "grad_norm": 0.008551147766411304, + "learning_rate": 4.799838720707846e-05, + "loss": 0.0175, + "step": 5080 + }, + { + "epoch": 13.558882235528943, + "grad_norm": 0.4467279314994812, + "learning_rate": 4.7817802591875426e-05, + "loss": 0.0756, + "step": 5085 + }, + { + "epoch": 13.572188955422488, + "grad_norm": 0.6588059067726135, + "learning_rate": 4.76374515208473e-05, + "loss": 0.0569, + "step": 5090 + }, + { + "epoch": 13.585495675316034, + "grad_norm": 0.3777833580970764, + "learning_rate": 4.7457334801164775e-05, + "loss": 0.0071, + "step": 5095 + }, + { + "epoch": 13.598802395209582, + "grad_norm": 0.0004773762484546751, + "learning_rate": 4.727745323894976e-05, + "loss": 0.0472, + "step": 5100 + }, + { + "epoch": 13.612109115103127, + "grad_norm": 0.41729772090911865, + "learning_rate": 4.7097807639271683e-05, + "loss": 0.049, + "step": 5105 + }, + { + "epoch": 13.625415834996673, + "grad_norm": 0.00019701718701981008, + "learning_rate": 4.691839880614389e-05, + "loss": 0.0167, + "step": 5110 + }, + { + "epoch": 13.638722554890219, + "grad_norm": 0.540219247341156, + "learning_rate": 4.673922754252002e-05, + "loss": 0.0383, + "step": 5115 + }, + { + "epoch": 13.652029274783766, + "grad_norm": 0.1812252551317215, + "learning_rate": 4.656029465029057e-05, + "loss": 0.04, + "step": 5120 + }, + { + "epoch": 13.665335994677312, + "grad_norm": 0.8494847416877747, + "learning_rate": 4.638160093027908e-05, + "loss": 0.039, + "step": 5125 + }, + { + "epoch": 13.678642714570858, + "grad_norm": 0.5483289957046509, + "learning_rate": 4.620314718223876e-05, + "loss": 0.0531, + "step": 5130 + }, + { + "epoch": 13.691949434464405, + "grad_norm": 1.038064956665039, + "learning_rate": 4.6024934204848745e-05, + "loss": 0.0554, + "step": 5135 + }, + { + "epoch": 13.70525615435795, + "grad_norm": 9.937622962752357e-05, + "learning_rate": 4.5846962795710556e-05, + "loss": 0.0337, + "step": 5140 + }, + { + "epoch": 13.718562874251496, + "grad_norm": 1.4000746011734009, + "learning_rate": 4.566923375134472e-05, + "loss": 0.04, + "step": 5145 + }, + { + "epoch": 13.731869594145044, + "grad_norm": 0.0001771347306203097, + "learning_rate": 4.549174786718684e-05, + "loss": 0.0263, + "step": 5150 + }, + { + "epoch": 13.74517631403859, + "grad_norm": 0.5845703482627869, + "learning_rate": 4.5314505937584417e-05, + "loss": 0.0491, + "step": 5155 + }, + { + "epoch": 13.758483033932135, + "grad_norm": 0.49106454849243164, + "learning_rate": 4.513750875579303e-05, + "loss": 0.1028, + "step": 5160 + }, + { + "epoch": 13.771789753825683, + "grad_norm": 0.4913535416126251, + "learning_rate": 4.4960757113972864e-05, + "loss": 0.0562, + "step": 5165 + }, + { + "epoch": 13.785096473719229, + "grad_norm": 0.6593743562698364, + "learning_rate": 4.4784251803185226e-05, + "loss": 0.0398, + "step": 5170 + }, + { + "epoch": 13.798403193612774, + "grad_norm": 0.0005124734016135335, + "learning_rate": 4.4607993613388976e-05, + "loss": 0.027, + "step": 5175 + }, + { + "epoch": 13.81170991350632, + "grad_norm": 0.0022483225911855698, + "learning_rate": 4.44319833334369e-05, + "loss": 0.0327, + "step": 5180 + }, + { + "epoch": 13.825016633399867, + "grad_norm": 0.0013437450397759676, + "learning_rate": 4.425622175107229e-05, + "loss": 0.0602, + "step": 5185 + }, + { + "epoch": 13.838323353293413, + "grad_norm": 0.46366244554519653, + "learning_rate": 4.4080709652925336e-05, + "loss": 0.0184, + "step": 5190 + }, + { + "epoch": 13.851630073186959, + "grad_norm": 0.00015787413576617837, + "learning_rate": 4.390544782450971e-05, + "loss": 0.072, + "step": 5195 + }, + { + "epoch": 13.864936793080506, + "grad_norm": 0.00038810219848528504, + "learning_rate": 4.373043705021899e-05, + "loss": 0.0315, + "step": 5200 + }, + { + "epoch": 13.878243512974052, + "grad_norm": 0.00165260536596179, + "learning_rate": 4.355567811332311e-05, + "loss": 0.0077, + "step": 5205 + }, + { + "epoch": 13.891550232867598, + "grad_norm": 0.6691223382949829, + "learning_rate": 4.338117179596485e-05, + "loss": 0.0221, + "step": 5210 + }, + { + "epoch": 13.904856952761145, + "grad_norm": 0.0013225431321188807, + "learning_rate": 4.32069188791565e-05, + "loss": 0.0422, + "step": 5215 + }, + { + "epoch": 13.918163672654691, + "grad_norm": 0.0003881505399476737, + "learning_rate": 4.3032920142776125e-05, + "loss": 0.0415, + "step": 5220 + }, + { + "epoch": 13.931470392548237, + "grad_norm": 0.46118324995040894, + "learning_rate": 4.285917636556429e-05, + "loss": 0.0262, + "step": 5225 + }, + { + "epoch": 13.944777112441782, + "grad_norm": 0.0037599815987050533, + "learning_rate": 4.26856883251204e-05, + "loss": 0.0271, + "step": 5230 + }, + { + "epoch": 13.95808383233533, + "grad_norm": 0.5800197720527649, + "learning_rate": 4.251245679789928e-05, + "loss": 0.0481, + "step": 5235 + }, + { + "epoch": 13.971390552228875, + "grad_norm": 0.5036775469779968, + "learning_rate": 4.233948255920785e-05, + "loss": 0.0188, + "step": 5240 + }, + { + "epoch": 13.984697272122421, + "grad_norm": 0.5240902900695801, + "learning_rate": 4.216676638320135e-05, + "loss": 0.0113, + "step": 5245 + }, + { + "epoch": 13.998003992015969, + "grad_norm": 0.0005705656949430704, + "learning_rate": 4.19943090428802e-05, + "loss": 0.01, + "step": 5250 + }, + { + "epoch": 13.998003992015969, + "eval_loss": 1.5540084838867188, + "eval_macro_f1": 68.44116182144285, + "eval_macro_precision": 70.37403623594724, + "eval_macro_recall": 67.0800347172364, + "eval_micro_f1": 87.3877245508982, + "eval_micro_precision": 87.3877245508982, + "eval_micro_recall": 87.3877245508982, + "eval_runtime": 4.7976, + "eval_samples_per_second": 556.948, + "eval_steps_per_second": 34.809, + "step": 5250 + }, + { + "epoch": 14.013306719893546, + "grad_norm": 0.44576719403266907, + "learning_rate": 4.182211131008628e-05, + "loss": 0.0421, + "step": 5255 + }, + { + "epoch": 14.026613439787093, + "grad_norm": 0.0001893198787001893, + "learning_rate": 4.1650173955499585e-05, + "loss": 0.053, + "step": 5260 + }, + { + "epoch": 14.039920159680639, + "grad_norm": 0.00020121457055211067, + "learning_rate": 4.147849774863488e-05, + "loss": 0.0323, + "step": 5265 + }, + { + "epoch": 14.053226879574185, + "grad_norm": 0.37343332171440125, + "learning_rate": 4.1307083457838e-05, + "loss": 0.0444, + "step": 5270 + }, + { + "epoch": 14.066533599467732, + "grad_norm": 0.0002352256269659847, + "learning_rate": 4.1135931850282726e-05, + "loss": 0.0324, + "step": 5275 + }, + { + "epoch": 14.079840319361278, + "grad_norm": 0.4024161398410797, + "learning_rate": 4.096504369196704e-05, + "loss": 0.0149, + "step": 5280 + }, + { + "epoch": 14.093147039254823, + "grad_norm": 0.6625308394432068, + "learning_rate": 4.0794419747709886e-05, + "loss": 0.012, + "step": 5285 + }, + { + "epoch": 14.10645375914837, + "grad_norm": 0.4134993255138397, + "learning_rate": 4.062406078114776e-05, + "loss": 0.0451, + "step": 5290 + }, + { + "epoch": 14.119760479041917, + "grad_norm": 0.600199282169342, + "learning_rate": 4.045396755473121e-05, + "loss": 0.0326, + "step": 5295 + }, + { + "epoch": 14.133067198935462, + "grad_norm": 0.7752711772918701, + "learning_rate": 4.028414082972141e-05, + "loss": 0.0541, + "step": 5300 + }, + { + "epoch": 14.146373918829008, + "grad_norm": 0.0010675047524273396, + "learning_rate": 4.011458136618681e-05, + "loss": 0.0139, + "step": 5305 + }, + { + "epoch": 14.159680638722556, + "grad_norm": 0.6033095717430115, + "learning_rate": 3.994528992299971e-05, + "loss": 0.026, + "step": 5310 + }, + { + "epoch": 14.172987358616101, + "grad_norm": 0.6383993625640869, + "learning_rate": 3.977626725783291e-05, + "loss": 0.057, + "step": 5315 + }, + { + "epoch": 14.186294078509647, + "grad_norm": 0.008502921089529991, + "learning_rate": 3.960751412715629e-05, + "loss": 0.0275, + "step": 5320 + }, + { + "epoch": 14.199600798403194, + "grad_norm": 0.5096533894538879, + "learning_rate": 3.943903128623335e-05, + "loss": 0.0325, + "step": 5325 + }, + { + "epoch": 14.21290751829674, + "grad_norm": 0.00016405718633905053, + "learning_rate": 3.9270819489117904e-05, + "loss": 0.019, + "step": 5330 + }, + { + "epoch": 14.226214238190286, + "grad_norm": 0.0003083451883867383, + "learning_rate": 3.9102879488650757e-05, + "loss": 0.0253, + "step": 5335 + }, + { + "epoch": 14.239520958083832, + "grad_norm": 0.5092978477478027, + "learning_rate": 3.893521203645618e-05, + "loss": 0.0634, + "step": 5340 + }, + { + "epoch": 14.252827677977379, + "grad_norm": 0.0014424554537981749, + "learning_rate": 3.876781788293876e-05, + "loss": 0.0181, + "step": 5345 + }, + { + "epoch": 14.266134397870925, + "grad_norm": 0.0012855530949309468, + "learning_rate": 3.860069777727983e-05, + "loss": 0.0605, + "step": 5350 + }, + { + "epoch": 14.27944111776447, + "grad_norm": 0.37602004408836365, + "learning_rate": 3.843385246743417e-05, + "loss": 0.0319, + "step": 5355 + }, + { + "epoch": 14.292747837658018, + "grad_norm": 0.21032342314720154, + "learning_rate": 3.826728270012686e-05, + "loss": 0.0396, + "step": 5360 + }, + { + "epoch": 14.306054557551564, + "grad_norm": 0.36076977849006653, + "learning_rate": 3.810098922084958e-05, + "loss": 0.0277, + "step": 5365 + }, + { + "epoch": 14.31936127744511, + "grad_norm": 0.2929077446460724, + "learning_rate": 3.7934972773857634e-05, + "loss": 0.0274, + "step": 5370 + }, + { + "epoch": 14.332667997338657, + "grad_norm": 0.760208249092102, + "learning_rate": 3.776923410216636e-05, + "loss": 0.1027, + "step": 5375 + }, + { + "epoch": 14.345974717232203, + "grad_norm": 0.8322664499282837, + "learning_rate": 3.7603773947547874e-05, + "loss": 0.0629, + "step": 5380 + }, + { + "epoch": 14.359281437125748, + "grad_norm": 0.6654631495475769, + "learning_rate": 3.7438593050527845e-05, + "loss": 0.0398, + "step": 5385 + }, + { + "epoch": 14.372588157019294, + "grad_norm": 5.7775789173319936e-05, + "learning_rate": 3.7273692150382135e-05, + "loss": 0.0, + "step": 5390 + }, + { + "epoch": 14.385894876912841, + "grad_norm": 0.5650693774223328, + "learning_rate": 3.710907198513337e-05, + "loss": 0.0329, + "step": 5395 + }, + { + "epoch": 14.399201596806387, + "grad_norm": 0.00029549148166552186, + "learning_rate": 3.694473329154778e-05, + "loss": 0.0329, + "step": 5400 + }, + { + "epoch": 14.412508316699933, + "grad_norm": 0.008520903065800667, + "learning_rate": 3.678067680513182e-05, + "loss": 0.0216, + "step": 5405 + }, + { + "epoch": 14.42581503659348, + "grad_norm": 0.0003043456526938826, + "learning_rate": 3.661690326012897e-05, + "loss": 0.0083, + "step": 5410 + }, + { + "epoch": 14.439121756487026, + "grad_norm": 0.00047451278078369796, + "learning_rate": 3.645341338951639e-05, + "loss": 0.0026, + "step": 5415 + }, + { + "epoch": 14.452428476380572, + "grad_norm": 0.00206236750818789, + "learning_rate": 3.6290207925001584e-05, + "loss": 0.0085, + "step": 5420 + }, + { + "epoch": 14.46573519627412, + "grad_norm": 0.36209532618522644, + "learning_rate": 3.6127287597019186e-05, + "loss": 0.0063, + "step": 5425 + }, + { + "epoch": 14.479041916167665, + "grad_norm": 0.12500470876693726, + "learning_rate": 3.5964653134727776e-05, + "loss": 0.0282, + "step": 5430 + }, + { + "epoch": 14.49234863606121, + "grad_norm": 0.47154736518859863, + "learning_rate": 3.580230526600639e-05, + "loss": 0.0214, + "step": 5435 + }, + { + "epoch": 14.505655355954758, + "grad_norm": 0.038162682205438614, + "learning_rate": 3.564024471745154e-05, + "loss": 0.0106, + "step": 5440 + }, + { + "epoch": 14.518962075848304, + "grad_norm": 8.110934868454933e-05, + "learning_rate": 3.547847221437372e-05, + "loss": 0.0286, + "step": 5445 + }, + { + "epoch": 14.53226879574185, + "grad_norm": 0.0009972963016480207, + "learning_rate": 3.531698848079425e-05, + "loss": 0.0303, + "step": 5450 + }, + { + "epoch": 14.545575515635395, + "grad_norm": 0.5373179912567139, + "learning_rate": 3.5155794239442184e-05, + "loss": 0.0664, + "step": 5455 + }, + { + "epoch": 14.558882235528943, + "grad_norm": 0.6623542904853821, + "learning_rate": 3.4994890211750754e-05, + "loss": 0.0555, + "step": 5460 + }, + { + "epoch": 14.572188955422488, + "grad_norm": 0.7875820398330688, + "learning_rate": 3.483427711785449e-05, + "loss": 0.0798, + "step": 5465 + }, + { + "epoch": 14.585495675316034, + "grad_norm": 0.5920644998550415, + "learning_rate": 3.467395567658573e-05, + "loss": 0.0409, + "step": 5470 + }, + { + "epoch": 14.598802395209582, + "grad_norm": 0.0001885549136204645, + "learning_rate": 3.45139266054715e-05, + "loss": 0.0487, + "step": 5475 + }, + { + "epoch": 14.612109115103127, + "grad_norm": 0.32315418124198914, + "learning_rate": 3.4354190620730406e-05, + "loss": 0.0674, + "step": 5480 + }, + { + "epoch": 14.625415834996673, + "grad_norm": 5.668059384333901e-05, + "learning_rate": 3.419474843726921e-05, + "loss": 0.0275, + "step": 5485 + }, + { + "epoch": 14.638722554890219, + "grad_norm": 0.0033752620220184326, + "learning_rate": 3.4035600768679855e-05, + "loss": 0.0251, + "step": 5490 + }, + { + "epoch": 14.652029274783766, + "grad_norm": 0.6533347964286804, + "learning_rate": 3.387674832723611e-05, + "loss": 0.0937, + "step": 5495 + }, + { + "epoch": 14.665335994677312, + "grad_norm": 0.0007474491721950471, + "learning_rate": 3.37181918238904e-05, + "loss": 0.0246, + "step": 5500 + }, + { + "epoch": 14.678642714570858, + "grad_norm": 0.6492227911949158, + "learning_rate": 3.3559931968270753e-05, + "loss": 0.0379, + "step": 5505 + }, + { + "epoch": 14.691949434464405, + "grad_norm": 0.0007334169349633157, + "learning_rate": 3.340196946867753e-05, + "loss": 0.0349, + "step": 5510 + }, + { + "epoch": 14.70525615435795, + "grad_norm": 0.00012730961316265166, + "learning_rate": 3.3244305032080183e-05, + "loss": 0.0225, + "step": 5515 + }, + { + "epoch": 14.718562874251496, + "grad_norm": 0.00010851504339370877, + "learning_rate": 3.308693936411421e-05, + "loss": 0.0278, + "step": 5520 + }, + { + "epoch": 14.731869594145044, + "grad_norm": 8.108102338155732e-05, + "learning_rate": 3.292987316907792e-05, + "loss": 0.0298, + "step": 5525 + }, + { + "epoch": 14.74517631403859, + "grad_norm": 1.1605405807495117, + "learning_rate": 3.2773107149929384e-05, + "loss": 0.1119, + "step": 5530 + }, + { + "epoch": 14.758483033932135, + "grad_norm": 0.00032222739537246525, + "learning_rate": 3.2616642008283213e-05, + "loss": 0.0169, + "step": 5535 + }, + { + "epoch": 14.771789753825683, + "grad_norm": 0.0024410784244537354, + "learning_rate": 3.2460478444407374e-05, + "loss": 0.024, + "step": 5540 + }, + { + "epoch": 14.785096473719229, + "grad_norm": 0.028958754613995552, + "learning_rate": 3.230461715722007e-05, + "loss": 0.0064, + "step": 5545 + }, + { + "epoch": 14.798403193612774, + "grad_norm": 0.7407286763191223, + "learning_rate": 3.21490588442868e-05, + "loss": 0.0552, + "step": 5550 + }, + { + "epoch": 14.81170991350632, + "grad_norm": 0.49551257491111755, + "learning_rate": 3.19938042018169e-05, + "loss": 0.0356, + "step": 5555 + }, + { + "epoch": 14.825016633399867, + "grad_norm": 0.7646135687828064, + "learning_rate": 3.1838853924660795e-05, + "loss": 0.0379, + "step": 5560 + }, + { + "epoch": 14.838323353293413, + "grad_norm": 0.000533775077201426, + "learning_rate": 3.1684208706306574e-05, + "loss": 0.0099, + "step": 5565 + }, + { + "epoch": 14.851630073186959, + "grad_norm": 0.414894163608551, + "learning_rate": 3.152986923887703e-05, + "loss": 0.0079, + "step": 5570 + }, + { + "epoch": 14.864936793080506, + "grad_norm": 0.7916562557220459, + "learning_rate": 3.137583621312665e-05, + "loss": 0.0791, + "step": 5575 + }, + { + "epoch": 14.878243512974052, + "grad_norm": 0.4727895259857178, + "learning_rate": 3.1222110318438304e-05, + "loss": 0.0614, + "step": 5580 + }, + { + "epoch": 14.891550232867598, + "grad_norm": 1.6393200159072876, + "learning_rate": 3.1068692242820386e-05, + "loss": 0.0374, + "step": 5585 + }, + { + "epoch": 14.904856952761145, + "grad_norm": 0.47209104895591736, + "learning_rate": 3.0915582672903556e-05, + "loss": 0.0177, + "step": 5590 + }, + { + "epoch": 14.918163672654691, + "grad_norm": 0.45247048139572144, + "learning_rate": 3.076278229393773e-05, + "loss": 0.0435, + "step": 5595 + }, + { + "epoch": 14.931470392548237, + "grad_norm": 1.2875940799713135, + "learning_rate": 3.0610291789789095e-05, + "loss": 0.0685, + "step": 5600 + }, + { + "epoch": 14.944777112441782, + "grad_norm": 0.000212906816159375, + "learning_rate": 3.0458111842936952e-05, + "loss": 0.0407, + "step": 5605 + }, + { + "epoch": 14.95808383233533, + "grad_norm": 0.4576803147792816, + "learning_rate": 3.030624313447067e-05, + "loss": 0.025, + "step": 5610 + }, + { + "epoch": 14.971390552228875, + "grad_norm": 0.6176482439041138, + "learning_rate": 3.0154686344086636e-05, + "loss": 0.0568, + "step": 5615 + }, + { + "epoch": 14.984697272122421, + "grad_norm": 8.765784878050908e-05, + "learning_rate": 3.0003442150085236e-05, + "loss": 0.0316, + "step": 5620 + }, + { + "epoch": 14.998003992015969, + "grad_norm": 0.6814188361167908, + "learning_rate": 2.9852511229367865e-05, + "loss": 0.0439, + "step": 5625 + }, + { + "epoch": 14.998003992015969, + "eval_loss": 1.587638258934021, + "eval_macro_f1": 68.64532527567134, + "eval_macro_precision": 70.73194829515054, + "eval_macro_recall": 67.08167804230526, + "eval_micro_f1": 87.57485029940119, + "eval_micro_precision": 87.57485029940119, + "eval_micro_recall": 87.57485029940119, + "eval_runtime": 4.6604, + "eval_samples_per_second": 573.342, + "eval_steps_per_second": 35.834, + "step": 5625 + }, + { + "epoch": 15.013306719893546, + "grad_norm": 0.00012128758680773899, + "learning_rate": 2.9701894257433826e-05, + "loss": 0.0248, + "step": 5630 + }, + { + "epoch": 15.026613439787093, + "grad_norm": 0.0005043774144724011, + "learning_rate": 2.9551591908377308e-05, + "loss": 0.0379, + "step": 5635 + }, + { + "epoch": 15.039920159680639, + "grad_norm": 0.00016329926438629627, + "learning_rate": 2.9401604854884357e-05, + "loss": 0.0092, + "step": 5640 + }, + { + "epoch": 15.053226879574185, + "grad_norm": 0.39029860496520996, + "learning_rate": 2.925193376822999e-05, + "loss": 0.0385, + "step": 5645 + }, + { + "epoch": 15.066533599467732, + "grad_norm": 0.40662872791290283, + "learning_rate": 2.9102579318274992e-05, + "loss": 0.0364, + "step": 5650 + }, + { + "epoch": 15.079840319361278, + "grad_norm": 0.4595581293106079, + "learning_rate": 2.8953542173463133e-05, + "loss": 0.0221, + "step": 5655 + }, + { + "epoch": 15.093147039254823, + "grad_norm": 0.5248783230781555, + "learning_rate": 2.880482300081797e-05, + "loss": 0.0464, + "step": 5660 + }, + { + "epoch": 15.10645375914837, + "grad_norm": 0.5380111336708069, + "learning_rate": 2.8656422465939993e-05, + "loss": 0.0204, + "step": 5665 + }, + { + "epoch": 15.119760479041917, + "grad_norm": 0.00021512297098524868, + "learning_rate": 2.8508341233003654e-05, + "loss": 0.0201, + "step": 5670 + }, + { + "epoch": 15.133067198935462, + "grad_norm": 0.00022878289746586233, + "learning_rate": 2.8360579964754274e-05, + "loss": 0.034, + "step": 5675 + }, + { + "epoch": 15.146373918829008, + "grad_norm": 0.0010837200097739697, + "learning_rate": 2.8213139322505243e-05, + "loss": 0.0711, + "step": 5680 + }, + { + "epoch": 15.159680638722556, + "grad_norm": 0.0003766873269341886, + "learning_rate": 2.8066019966134904e-05, + "loss": 0.0668, + "step": 5685 + }, + { + "epoch": 15.172987358616101, + "grad_norm": 0.778472900390625, + "learning_rate": 2.7919222554083667e-05, + "loss": 0.0266, + "step": 5690 + }, + { + "epoch": 15.186294078509647, + "grad_norm": 0.6452155709266663, + "learning_rate": 2.7772747743351135e-05, + "loss": 0.0261, + "step": 5695 + }, + { + "epoch": 15.199600798403194, + "grad_norm": 0.5438355803489685, + "learning_rate": 2.7626596189492983e-05, + "loss": 0.0402, + "step": 5700 + }, + { + "epoch": 15.21290751829674, + "grad_norm": 0.0052736373618245125, + "learning_rate": 2.7480768546618264e-05, + "loss": 0.0318, + "step": 5705 + }, + { + "epoch": 15.226214238190286, + "grad_norm": 0.0006670206203125417, + "learning_rate": 2.733526546738624e-05, + "loss": 0.0063, + "step": 5710 + }, + { + "epoch": 15.239520958083832, + "grad_norm": 0.00027919738204218447, + "learning_rate": 2.719008760300359e-05, + "loss": 0.0122, + "step": 5715 + }, + { + "epoch": 15.252827677977379, + "grad_norm": 0.0005136179970577359, + "learning_rate": 2.704523560322153e-05, + "loss": 0.0162, + "step": 5720 + }, + { + "epoch": 15.266134397870925, + "grad_norm": 0.003726151306182146, + "learning_rate": 2.690071011633284e-05, + "loss": 0.0202, + "step": 5725 + }, + { + "epoch": 15.27944111776447, + "grad_norm": 0.9570088982582092, + "learning_rate": 2.6756511789168925e-05, + "loss": 0.0452, + "step": 5730 + }, + { + "epoch": 15.292747837658018, + "grad_norm": 0.5644978284835815, + "learning_rate": 2.6612641267097005e-05, + "loss": 0.0692, + "step": 5735 + }, + { + "epoch": 15.306054557551564, + "grad_norm": 0.6403547525405884, + "learning_rate": 2.6469099194017143e-05, + "loss": 0.0542, + "step": 5740 + }, + { + "epoch": 15.31936127744511, + "grad_norm": 0.001306506572291255, + "learning_rate": 2.6325886212359498e-05, + "loss": 0.0193, + "step": 5745 + }, + { + "epoch": 15.332667997338657, + "grad_norm": 0.5799793601036072, + "learning_rate": 2.618300296308135e-05, + "loss": 0.0315, + "step": 5750 + }, + { + "epoch": 15.345974717232203, + "grad_norm": 0.0005930495681241155, + "learning_rate": 2.6040450085664158e-05, + "loss": 0.0418, + "step": 5755 + }, + { + "epoch": 15.359281437125748, + "grad_norm": 0.6801490783691406, + "learning_rate": 2.589822821811083e-05, + "loss": 0.0348, + "step": 5760 + }, + { + "epoch": 15.372588157019294, + "grad_norm": 0.5044436454772949, + "learning_rate": 2.5756337996942892e-05, + "loss": 0.0147, + "step": 5765 + }, + { + "epoch": 15.385894876912841, + "grad_norm": 0.5001690983772278, + "learning_rate": 2.561478005719743e-05, + "loss": 0.0757, + "step": 5770 + }, + { + "epoch": 15.399201596806387, + "grad_norm": 0.42732563614845276, + "learning_rate": 2.5473555032424533e-05, + "loss": 0.0433, + "step": 5775 + }, + { + "epoch": 15.412508316699933, + "grad_norm": 0.5017266869544983, + "learning_rate": 2.5332663554684222e-05, + "loss": 0.0308, + "step": 5780 + }, + { + "epoch": 15.42581503659348, + "grad_norm": 7.58219975978136e-05, + "learning_rate": 2.519210625454369e-05, + "loss": 0.0003, + "step": 5785 + }, + { + "epoch": 15.439121756487026, + "grad_norm": 0.00032350822584703565, + "learning_rate": 2.5051883761074614e-05, + "loss": 0.0352, + "step": 5790 + }, + { + "epoch": 15.452428476380572, + "grad_norm": 0.5080671906471252, + "learning_rate": 2.491199670185008e-05, + "loss": 0.019, + "step": 5795 + }, + { + "epoch": 15.46573519627412, + "grad_norm": 0.6078925132751465, + "learning_rate": 2.477244570294206e-05, + "loss": 0.0204, + "step": 5800 + }, + { + "epoch": 15.479041916167665, + "grad_norm": 0.44966697692871094, + "learning_rate": 2.4633231388918378e-05, + "loss": 0.0394, + "step": 5805 + }, + { + "epoch": 15.49234863606121, + "grad_norm": 0.0005682120099663734, + "learning_rate": 2.449435438284e-05, + "loss": 0.0241, + "step": 5810 + }, + { + "epoch": 15.505655355954758, + "grad_norm": 0.5577021837234497, + "learning_rate": 2.4355815306258354e-05, + "loss": 0.0319, + "step": 5815 + }, + { + "epoch": 15.518962075848304, + "grad_norm": 0.5054609179496765, + "learning_rate": 2.4217614779212315e-05, + "loss": 0.0302, + "step": 5820 + }, + { + "epoch": 15.53226879574185, + "grad_norm": 0.4755954444408417, + "learning_rate": 2.4079753420225694e-05, + "loss": 0.0302, + "step": 5825 + }, + { + "epoch": 15.545575515635395, + "grad_norm": 1.0765711069107056, + "learning_rate": 2.394223184630422e-05, + "loss": 0.0662, + "step": 5830 + }, + { + "epoch": 15.558882235528943, + "grad_norm": 0.550977349281311, + "learning_rate": 2.3805050672932928e-05, + "loss": 0.0355, + "step": 5835 + }, + { + "epoch": 15.572188955422488, + "grad_norm": 0.5157077312469482, + "learning_rate": 2.366821051407342e-05, + "loss": 0.0552, + "step": 5840 + }, + { + "epoch": 15.585495675316034, + "grad_norm": 0.8127491474151611, + "learning_rate": 2.3531711982161066e-05, + "loss": 0.0612, + "step": 5845 + }, + { + "epoch": 15.598802395209582, + "grad_norm": 0.0020727987866848707, + "learning_rate": 2.339555568810221e-05, + "loss": 0.0252, + "step": 5850 + }, + { + "epoch": 15.612109115103127, + "grad_norm": 0.00020015117479488254, + "learning_rate": 2.32597422412715e-05, + "loss": 0.0351, + "step": 5855 + }, + { + "epoch": 15.625415834996673, + "grad_norm": 0.859960675239563, + "learning_rate": 2.3124272249509226e-05, + "loss": 0.0501, + "step": 5860 + }, + { + "epoch": 15.638722554890219, + "grad_norm": 0.5178987979888916, + "learning_rate": 2.2989146319118425e-05, + "loss": 0.0377, + "step": 5865 + }, + { + "epoch": 15.652029274783766, + "grad_norm": 0.4624478220939636, + "learning_rate": 2.2854365054862382e-05, + "loss": 0.0519, + "step": 5870 + }, + { + "epoch": 15.665335994677312, + "grad_norm": 0.5773726105690002, + "learning_rate": 2.2719929059961698e-05, + "loss": 0.0177, + "step": 5875 + }, + { + "epoch": 15.678642714570858, + "grad_norm": 0.5505730509757996, + "learning_rate": 2.2585838936091754e-05, + "loss": 0.0285, + "step": 5880 + }, + { + "epoch": 15.691949434464405, + "grad_norm": 0.9743322730064392, + "learning_rate": 2.2452095283380003e-05, + "loss": 0.0165, + "step": 5885 + }, + { + "epoch": 15.70525615435795, + "grad_norm": 0.46247196197509766, + "learning_rate": 2.2318698700403174e-05, + "loss": 0.0538, + "step": 5890 + }, + { + "epoch": 15.718562874251496, + "grad_norm": 0.40537363290786743, + "learning_rate": 2.2185649784184746e-05, + "loss": 0.0436, + "step": 5895 + }, + { + "epoch": 15.731869594145044, + "grad_norm": 0.00572928274050355, + "learning_rate": 2.2052949130192136e-05, + "loss": 0.0434, + "step": 5900 + }, + { + "epoch": 15.74517631403859, + "grad_norm": 0.3674880862236023, + "learning_rate": 2.192059733233408e-05, + "loss": 0.0391, + "step": 5905 + }, + { + "epoch": 15.758483033932135, + "grad_norm": 0.4856221079826355, + "learning_rate": 2.178859498295809e-05, + "loss": 0.0211, + "step": 5910 + }, + { + "epoch": 15.771789753825683, + "grad_norm": 0.00038009166019037366, + "learning_rate": 2.1656942672847568e-05, + "loss": 0.0469, + "step": 5915 + }, + { + "epoch": 15.785096473719229, + "grad_norm": 0.14428001642227173, + "learning_rate": 2.152564099121944e-05, + "loss": 0.0424, + "step": 5920 + }, + { + "epoch": 15.798403193612774, + "grad_norm": 8.240526949521154e-05, + "learning_rate": 2.139469052572127e-05, + "loss": 0.0093, + "step": 5925 + }, + { + "epoch": 15.81170991350632, + "grad_norm": 0.5241426825523376, + "learning_rate": 2.1264091862428737e-05, + "loss": 0.0186, + "step": 5930 + }, + { + "epoch": 15.825016633399867, + "grad_norm": 0.45298877358436584, + "learning_rate": 2.113384558584307e-05, + "loss": 0.0947, + "step": 5935 + }, + { + "epoch": 15.838323353293413, + "grad_norm": 0.5497854948043823, + "learning_rate": 2.1003952278888382e-05, + "loss": 0.0561, + "step": 5940 + }, + { + "epoch": 15.851630073186959, + "grad_norm": 0.0001928492565639317, + "learning_rate": 2.087441252290897e-05, + "loss": 0.0088, + "step": 5945 + }, + { + "epoch": 15.864936793080506, + "grad_norm": 0.46321219205856323, + "learning_rate": 2.074522689766686e-05, + "loss": 0.0348, + "step": 5950 + }, + { + "epoch": 15.878243512974052, + "grad_norm": 0.4391571283340454, + "learning_rate": 2.0616395981339075e-05, + "loss": 0.0336, + "step": 5955 + }, + { + "epoch": 15.891550232867598, + "grad_norm": 0.5629188418388367, + "learning_rate": 2.0487920350515212e-05, + "loss": 0.0299, + "step": 5960 + }, + { + "epoch": 15.904856952761145, + "grad_norm": 0.00014769968402106315, + "learning_rate": 2.0359800580194764e-05, + "loss": 0.0195, + "step": 5965 + }, + { + "epoch": 15.918163672654691, + "grad_norm": 0.00027674203738570213, + "learning_rate": 2.0232037243784475e-05, + "loss": 0.0076, + "step": 5970 + }, + { + "epoch": 15.931470392548237, + "grad_norm": 0.4338863790035248, + "learning_rate": 2.010463091309587e-05, + "loss": 0.0457, + "step": 5975 + }, + { + "epoch": 15.944777112441782, + "grad_norm": 0.7306026220321655, + "learning_rate": 1.9977582158342754e-05, + "loss": 0.0369, + "step": 5980 + }, + { + "epoch": 15.95808383233533, + "grad_norm": 0.731495201587677, + "learning_rate": 1.985089154813846e-05, + "loss": 0.0672, + "step": 5985 + }, + { + "epoch": 15.971390552228875, + "grad_norm": 8.390431321458891e-05, + "learning_rate": 1.9724559649493567e-05, + "loss": 0.0107, + "step": 5990 + }, + { + "epoch": 15.984697272122421, + "grad_norm": 0.00010008139361161739, + "learning_rate": 1.95985870278131e-05, + "loss": 0.0474, + "step": 5995 + }, + { + "epoch": 15.998003992015969, + "grad_norm": 0.9425669312477112, + "learning_rate": 1.947297424689414e-05, + "loss": 0.0628, + "step": 6000 + }, + { + "epoch": 15.998003992015969, + "eval_loss": 1.6210857629776, + "eval_macro_f1": 67.81923557787161, + "eval_macro_precision": 70.7270600967691, + "eval_macro_recall": 65.89991990784617, + "eval_micro_f1": 87.4625748502994, + "eval_micro_precision": 87.4625748502994, + "eval_micro_recall": 87.4625748502994, + "eval_runtime": 5.0132, + "eval_samples_per_second": 532.989, + "eval_steps_per_second": 33.312, + "step": 6000 + }, + { + "epoch": 16.013306719893546, + "grad_norm": 7.936586189316586e-05, + "learning_rate": 1.9347721868923374e-05, + "loss": 0.043, + "step": 6005 + }, + { + "epoch": 16.02661343978709, + "grad_norm": 0.6429644227027893, + "learning_rate": 1.922283045447436e-05, + "loss": 0.0282, + "step": 6010 + }, + { + "epoch": 16.039920159680637, + "grad_norm": 0.0010867537930607796, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.0, + "step": 6015 + }, + { + "epoch": 16.053226879574186, + "grad_norm": 0.000851222372148186, + "learning_rate": 1.8974132750356153e-05, + "loss": 0.0348, + "step": 6020 + }, + { + "epoch": 16.066533599467732, + "grad_norm": 0.0010033567668870091, + "learning_rate": 1.8850327573746585e-05, + "loss": 0.0124, + "step": 6025 + }, + { + "epoch": 16.079840319361278, + "grad_norm": 0.5936465859413147, + "learning_rate": 1.8726885586773212e-05, + "loss": 0.0336, + "step": 6030 + }, + { + "epoch": 16.093147039254823, + "grad_norm": 0.0009507182403467596, + "learning_rate": 1.8603807341907098e-05, + "loss": 0.0458, + "step": 6035 + }, + { + "epoch": 16.10645375914837, + "grad_norm": 0.42788538336753845, + "learning_rate": 1.8481093389991468e-05, + "loss": 0.0257, + "step": 6040 + }, + { + "epoch": 16.119760479041915, + "grad_norm": 0.004346577450633049, + "learning_rate": 1.835874428023905e-05, + "loss": 0.023, + "step": 6045 + }, + { + "epoch": 16.133067198935464, + "grad_norm": 5.493039861903526e-05, + "learning_rate": 1.8236760560229714e-05, + "loss": 0.0208, + "step": 6050 + }, + { + "epoch": 16.14637391882901, + "grad_norm": 4.223493306199089e-05, + "learning_rate": 1.8115142775908045e-05, + "loss": 0.0375, + "step": 6055 + }, + { + "epoch": 16.159680638722556, + "grad_norm": 0.0001787821383913979, + "learning_rate": 1.7993891471580893e-05, + "loss": 0.0135, + "step": 6060 + }, + { + "epoch": 16.1729873586161, + "grad_norm": 0.5778005123138428, + "learning_rate": 1.7873007189914815e-05, + "loss": 0.0554, + "step": 6065 + }, + { + "epoch": 16.186294078509647, + "grad_norm": 3.0017821700312197e-05, + "learning_rate": 1.775249047193377e-05, + "loss": 0.0419, + "step": 6070 + }, + { + "epoch": 16.199600798403193, + "grad_norm": 0.6397530436515808, + "learning_rate": 1.763234185701673e-05, + "loss": 0.0409, + "step": 6075 + }, + { + "epoch": 16.21290751829674, + "grad_norm": 1.0274336338043213, + "learning_rate": 1.7512561882895108e-05, + "loss": 0.0525, + "step": 6080 + }, + { + "epoch": 16.226214238190288, + "grad_norm": 0.34178104996681213, + "learning_rate": 1.739315108565053e-05, + "loss": 0.0264, + "step": 6085 + }, + { + "epoch": 16.239520958083833, + "grad_norm": 5.178775609238073e-05, + "learning_rate": 1.7274109999712295e-05, + "loss": 0.0202, + "step": 6090 + }, + { + "epoch": 16.25282767797738, + "grad_norm": 0.4585123062133789, + "learning_rate": 1.7155439157855037e-05, + "loss": 0.0302, + "step": 6095 + }, + { + "epoch": 16.266134397870925, + "grad_norm": 0.0011848661815747619, + "learning_rate": 1.70371390911964e-05, + "loss": 0.0435, + "step": 6100 + }, + { + "epoch": 16.27944111776447, + "grad_norm": 4.1747145587578416e-05, + "learning_rate": 1.6919210329194533e-05, + "loss": 0.0183, + "step": 6105 + }, + { + "epoch": 16.292747837658016, + "grad_norm": 5.5612043070141226e-05, + "learning_rate": 1.6801653399645866e-05, + "loss": 0.0358, + "step": 6110 + }, + { + "epoch": 16.306054557551562, + "grad_norm": 8.079227700363845e-05, + "learning_rate": 1.668446882868262e-05, + "loss": 0.0414, + "step": 6115 + }, + { + "epoch": 16.31936127744511, + "grad_norm": 4.4177184463478625e-05, + "learning_rate": 1.6567657140770475e-05, + "loss": 0.0208, + "step": 6120 + }, + { + "epoch": 16.332667997338657, + "grad_norm": 4.0801765862852335e-05, + "learning_rate": 1.6451218858706374e-05, + "loss": 0.0323, + "step": 6125 + }, + { + "epoch": 16.345974717232203, + "grad_norm": 0.5345805287361145, + "learning_rate": 1.63351545036159e-05, + "loss": 0.0371, + "step": 6130 + }, + { + "epoch": 16.35928143712575, + "grad_norm": 2.5667854060884565e-05, + "learning_rate": 1.621946459495127e-05, + "loss": 0.01, + "step": 6135 + }, + { + "epoch": 16.372588157019294, + "grad_norm": 6.979352474445477e-05, + "learning_rate": 1.610414965048874e-05, + "loss": 0.0508, + "step": 6140 + }, + { + "epoch": 16.38589487691284, + "grad_norm": 0.6311120986938477, + "learning_rate": 1.5989210186326388e-05, + "loss": 0.0584, + "step": 6145 + }, + { + "epoch": 16.39920159680639, + "grad_norm": 0.00010010002733906731, + "learning_rate": 1.587464671688187e-05, + "loss": 0.0332, + "step": 6150 + }, + { + "epoch": 16.412508316699935, + "grad_norm": 1.4140156507492065, + "learning_rate": 1.5760459754890068e-05, + "loss": 0.0725, + "step": 6155 + }, + { + "epoch": 16.42581503659348, + "grad_norm": 0.00011153733794344589, + "learning_rate": 1.5646649811400705e-05, + "loss": 0.0245, + "step": 6160 + }, + { + "epoch": 16.439121756487026, + "grad_norm": 0.5203162431716919, + "learning_rate": 1.553321739577619e-05, + "loss": 0.0657, + "step": 6165 + }, + { + "epoch": 16.45242847638057, + "grad_norm": 0.7140324711799622, + "learning_rate": 1.542016301568926e-05, + "loss": 0.0483, + "step": 6170 + }, + { + "epoch": 16.465735196274117, + "grad_norm": 0.6292963027954102, + "learning_rate": 1.5307487177120772e-05, + "loss": 0.0601, + "step": 6175 + }, + { + "epoch": 16.479041916167663, + "grad_norm": 3.154838486807421e-05, + "learning_rate": 1.5195190384357404e-05, + "loss": 0.0624, + "step": 6180 + }, + { + "epoch": 16.492348636061212, + "grad_norm": 0.001199195859953761, + "learning_rate": 1.5083273139989352e-05, + "loss": 0.0111, + "step": 6185 + }, + { + "epoch": 16.505655355954758, + "grad_norm": 5.260263060336001e-05, + "learning_rate": 1.4971735944908106e-05, + "loss": 0.0191, + "step": 6190 + }, + { + "epoch": 16.518962075848304, + "grad_norm": 0.5854066610336304, + "learning_rate": 1.4860579298304312e-05, + "loss": 0.0253, + "step": 6195 + }, + { + "epoch": 16.53226879574185, + "grad_norm": 0.0001449223782401532, + "learning_rate": 1.4749803697665366e-05, + "loss": 0.0163, + "step": 6200 + }, + { + "epoch": 16.545575515635395, + "grad_norm": 0.5943820476531982, + "learning_rate": 1.463940963877335e-05, + "loss": 0.0181, + "step": 6205 + }, + { + "epoch": 16.55888223552894, + "grad_norm": 0.5208426713943481, + "learning_rate": 1.4529397615702656e-05, + "loss": 0.0566, + "step": 6210 + }, + { + "epoch": 16.57218895542249, + "grad_norm": 4.5326713006943464e-05, + "learning_rate": 1.4419768120817889e-05, + "loss": 0.0406, + "step": 6215 + }, + { + "epoch": 16.585495675316036, + "grad_norm": 1.1962686777114868, + "learning_rate": 1.4310521644771658e-05, + "loss": 0.0561, + "step": 6220 + }, + { + "epoch": 16.59880239520958, + "grad_norm": 5.617663555312902e-05, + "learning_rate": 1.4201658676502294e-05, + "loss": 0.043, + "step": 6225 + }, + { + "epoch": 16.612109115103127, + "grad_norm": 0.8843726515769958, + "learning_rate": 1.4093179703231784e-05, + "loss": 0.0501, + "step": 6230 + }, + { + "epoch": 16.625415834996673, + "grad_norm": 0.5637995600700378, + "learning_rate": 1.3985085210463477e-05, + "loss": 0.0111, + "step": 6235 + }, + { + "epoch": 16.63872255489022, + "grad_norm": 0.00020396114268805832, + "learning_rate": 1.3877375681979943e-05, + "loss": 0.0233, + "step": 6240 + }, + { + "epoch": 16.652029274783764, + "grad_norm": 0.14145420491695404, + "learning_rate": 1.3770051599840905e-05, + "loss": 0.0026, + "step": 6245 + }, + { + "epoch": 16.665335994677314, + "grad_norm": 0.6490848660469055, + "learning_rate": 1.3663113444380905e-05, + "loss": 0.0318, + "step": 6250 + }, + { + "epoch": 16.67864271457086, + "grad_norm": 5.598918869509362e-05, + "learning_rate": 1.3556561694207338e-05, + "loss": 0.0113, + "step": 6255 + }, + { + "epoch": 16.691949434464405, + "grad_norm": 0.5942937731742859, + "learning_rate": 1.3450396826198142e-05, + "loss": 0.0185, + "step": 6260 + }, + { + "epoch": 16.70525615435795, + "grad_norm": 3.837359690805897e-05, + "learning_rate": 1.3344619315499774e-05, + "loss": 0.0172, + "step": 6265 + }, + { + "epoch": 16.718562874251496, + "grad_norm": 5.086205783300102e-05, + "learning_rate": 1.3239229635525074e-05, + "loss": 0.0171, + "step": 6270 + }, + { + "epoch": 16.731869594145042, + "grad_norm": 0.0009596463642083108, + "learning_rate": 1.3134228257951142e-05, + "loss": 0.0107, + "step": 6275 + }, + { + "epoch": 16.745176314038588, + "grad_norm": 0.807654619216919, + "learning_rate": 1.302961565271713e-05, + "loss": 0.0477, + "step": 6280 + }, + { + "epoch": 16.758483033932137, + "grad_norm": 0.7362765073776245, + "learning_rate": 1.2925392288022298e-05, + "loss": 0.0635, + "step": 6285 + }, + { + "epoch": 16.771789753825683, + "grad_norm": 3.490053495625034e-05, + "learning_rate": 1.2821558630323772e-05, + "loss": 0.0254, + "step": 6290 + }, + { + "epoch": 16.78509647371923, + "grad_norm": 4.203675780445337e-05, + "learning_rate": 1.2718115144334574e-05, + "loss": 0.0474, + "step": 6295 + }, + { + "epoch": 16.798403193612774, + "grad_norm": 0.419802188873291, + "learning_rate": 1.2615062293021507e-05, + "loss": 0.0356, + "step": 6300 + }, + { + "epoch": 16.81170991350632, + "grad_norm": 0.6541265249252319, + "learning_rate": 1.251240053760302e-05, + "loss": 0.0469, + "step": 6305 + }, + { + "epoch": 16.825016633399866, + "grad_norm": 0.5618201494216919, + "learning_rate": 1.2410130337547177e-05, + "loss": 0.0295, + "step": 6310 + }, + { + "epoch": 16.83832335329341, + "grad_norm": 0.6275516152381897, + "learning_rate": 1.230825215056971e-05, + "loss": 0.0205, + "step": 6315 + }, + { + "epoch": 16.85163007318696, + "grad_norm": 0.6804540157318115, + "learning_rate": 1.2206766432631766e-05, + "loss": 0.0408, + "step": 6320 + }, + { + "epoch": 16.864936793080506, + "grad_norm": 0.5253809094429016, + "learning_rate": 1.2105673637938053e-05, + "loss": 0.0787, + "step": 6325 + }, + { + "epoch": 16.878243512974052, + "grad_norm": 0.45566126704216003, + "learning_rate": 1.2004974218934695e-05, + "loss": 0.0512, + "step": 6330 + }, + { + "epoch": 16.891550232867598, + "grad_norm": 0.46057701110839844, + "learning_rate": 1.1904668626307226e-05, + "loss": 0.0786, + "step": 6335 + }, + { + "epoch": 16.904856952761143, + "grad_norm": 6.161674536997452e-05, + "learning_rate": 1.1804757308978654e-05, + "loss": 0.0475, + "step": 6340 + }, + { + "epoch": 16.91816367265469, + "grad_norm": 0.00021025777095928788, + "learning_rate": 1.1705240714107302e-05, + "loss": 0.0294, + "step": 6345 + }, + { + "epoch": 16.93147039254824, + "grad_norm": 0.00022641375835519284, + "learning_rate": 1.1606119287084983e-05, + "loss": 0.0178, + "step": 6350 + }, + { + "epoch": 16.944777112441784, + "grad_norm": 9.136064181802794e-05, + "learning_rate": 1.1507393471534833e-05, + "loss": 0.0586, + "step": 6355 + }, + { + "epoch": 16.95808383233533, + "grad_norm": 0.0001223825238412246, + "learning_rate": 1.1409063709309442e-05, + "loss": 0.0377, + "step": 6360 + }, + { + "epoch": 16.971390552228875, + "grad_norm": 0.8817633986473083, + "learning_rate": 1.1311130440488848e-05, + "loss": 0.0462, + "step": 6365 + }, + { + "epoch": 16.98469727212242, + "grad_norm": 0.43441712856292725, + "learning_rate": 1.1213594103378588e-05, + "loss": 0.0405, + "step": 6370 + }, + { + "epoch": 16.998003992015967, + "grad_norm": 0.0001357852015644312, + "learning_rate": 1.1116455134507664e-05, + "loss": 0.0, + "step": 6375 + }, + { + "epoch": 16.998003992015967, + "eval_loss": 1.636362910270691, + "eval_macro_f1": 68.57243488427568, + "eval_macro_precision": 70.72150848759064, + "eval_macro_recall": 66.973441952334, + "eval_micro_f1": 87.57485029940119, + "eval_micro_precision": 87.57485029940119, + "eval_micro_recall": 87.57485029940119, + "eval_runtime": 4.8106, + "eval_samples_per_second": 555.437, + "eval_steps_per_second": 34.715, + "step": 6375 + }, + { + "epoch": 17.013306719893546, + "grad_norm": 0.001735365716740489, + "learning_rate": 1.1019713968626632e-05, + "loss": 0.0342, + "step": 6380 + }, + { + "epoch": 17.02661343978709, + "grad_norm": 0.0005165593465790153, + "learning_rate": 1.0923371038705677e-05, + "loss": 0.0, + "step": 6385 + }, + { + "epoch": 17.039920159680637, + "grad_norm": 0.00010086892143590376, + "learning_rate": 1.0827426775932658e-05, + "loss": 0.0599, + "step": 6390 + }, + { + "epoch": 17.053226879574186, + "grad_norm": 7.885311060817912e-05, + "learning_rate": 1.0731881609711247e-05, + "loss": 0.0247, + "step": 6395 + }, + { + "epoch": 17.066533599467732, + "grad_norm": 0.5551512241363525, + "learning_rate": 1.0636735967658784e-05, + "loss": 0.0672, + "step": 6400 + }, + { + "epoch": 17.079840319361278, + "grad_norm": 5.913793938816525e-05, + "learning_rate": 1.054199027560463e-05, + "loss": 0.0333, + "step": 6405 + }, + { + "epoch": 17.093147039254823, + "grad_norm": 0.0009562379564158618, + "learning_rate": 1.0447644957588165e-05, + "loss": 0.0277, + "step": 6410 + }, + { + "epoch": 17.10645375914837, + "grad_norm": 0.8013795018196106, + "learning_rate": 1.0353700435856772e-05, + "loss": 0.0465, + "step": 6415 + }, + { + "epoch": 17.119760479041915, + "grad_norm": 1.1875650882720947, + "learning_rate": 1.026015713086418e-05, + "loss": 0.0421, + "step": 6420 + }, + { + "epoch": 17.133067198935464, + "grad_norm": 3.995191218564287e-05, + "learning_rate": 1.0167015461268304e-05, + "loss": 0.0602, + "step": 6425 + }, + { + "epoch": 17.14637391882901, + "grad_norm": 0.00024099642178043723, + "learning_rate": 1.0074275843929626e-05, + "loss": 0.0086, + "step": 6430 + }, + { + "epoch": 17.159680638722556, + "grad_norm": 0.9614741206169128, + "learning_rate": 9.98193869390922e-06, + "loss": 0.0321, + "step": 6435 + }, + { + "epoch": 17.1729873586161, + "grad_norm": 0.531378984451294, + "learning_rate": 9.890004424466825e-06, + "loss": 0.0268, + "step": 6440 + }, + { + "epoch": 17.186294078509647, + "grad_norm": 0.5041787028312683, + "learning_rate": 9.798473447059154e-06, + "loss": 0.0498, + "step": 6445 + }, + { + "epoch": 17.199600798403193, + "grad_norm": 0.0004790777456946671, + "learning_rate": 9.707346171337894e-06, + "loss": 0.0184, + "step": 6450 + }, + { + "epoch": 17.21290751829674, + "grad_norm": 0.00019066996173933148, + "learning_rate": 9.616623005147951e-06, + "loss": 0.0333, + "step": 6455 + }, + { + "epoch": 17.226214238190288, + "grad_norm": 5.8852790971286595e-05, + "learning_rate": 9.526304354525672e-06, + "loss": 0.008, + "step": 6460 + }, + { + "epoch": 17.239520958083833, + "grad_norm": 1.407489538192749, + "learning_rate": 9.436390623696911e-06, + "loss": 0.0333, + "step": 6465 + }, + { + "epoch": 17.25282767797738, + "grad_norm": 0.6663693189620972, + "learning_rate": 9.346882215075348e-06, + "loss": 0.0217, + "step": 6470 + }, + { + "epoch": 17.266134397870925, + "grad_norm": 0.749689519405365, + "learning_rate": 9.257779529260557e-06, + "loss": 0.0431, + "step": 6475 + }, + { + "epoch": 17.27944111776447, + "grad_norm": 0.00026045890990644693, + "learning_rate": 9.16908296503628e-06, + "loss": 0.0085, + "step": 6480 + }, + { + "epoch": 17.292747837658016, + "grad_norm": 0.8062569499015808, + "learning_rate": 9.080792919368696e-06, + "loss": 0.044, + "step": 6485 + }, + { + "epoch": 17.306054557551562, + "grad_norm": 0.001404839102178812, + "learning_rate": 8.992909787404602e-06, + "loss": 0.017, + "step": 6490 + }, + { + "epoch": 17.31936127744511, + "grad_norm": 0.41923534870147705, + "learning_rate": 8.905433962469489e-06, + "loss": 0.0264, + "step": 6495 + }, + { + "epoch": 17.332667997338657, + "grad_norm": 0.00013469730038195848, + "learning_rate": 8.818365836066101e-06, + "loss": 0.0, + "step": 6500 + }, + { + "epoch": 17.345974717232203, + "grad_norm": 0.6122064590454102, + "learning_rate": 8.73170579787237e-06, + "loss": 0.0114, + "step": 6505 + }, + { + "epoch": 17.35928143712575, + "grad_norm": 1.719806432723999, + "learning_rate": 8.645454235739903e-06, + "loss": 0.045, + "step": 6510 + }, + { + "epoch": 17.372588157019294, + "grad_norm": 0.47664159536361694, + "learning_rate": 8.559611535692135e-06, + "loss": 0.0251, + "step": 6515 + }, + { + "epoch": 17.38589487691284, + "grad_norm": 9.255381883122027e-05, + "learning_rate": 8.474178081922524e-06, + "loss": 0.0424, + "step": 6520 + }, + { + "epoch": 17.39920159680639, + "grad_norm": 2.3919166778796352e-05, + "learning_rate": 8.38915425679304e-06, + "loss": 0.0126, + "step": 6525 + }, + { + "epoch": 17.412508316699935, + "grad_norm": 0.0001049041748046875, + "learning_rate": 8.304540440832298e-06, + "loss": 0.0189, + "step": 6530 + }, + { + "epoch": 17.42581503659348, + "grad_norm": 0.6109215617179871, + "learning_rate": 8.22033701273387e-06, + "loss": 0.0216, + "step": 6535 + }, + { + "epoch": 17.439121756487026, + "grad_norm": 9.519642480881885e-05, + "learning_rate": 8.13654434935467e-06, + "loss": 0.0244, + "step": 6540 + }, + { + "epoch": 17.45242847638057, + "grad_norm": 0.3934282660484314, + "learning_rate": 8.053162825713134e-06, + "loss": 0.0335, + "step": 6545 + }, + { + "epoch": 17.465735196274117, + "grad_norm": 0.0003310267929919064, + "learning_rate": 7.970192814987675e-06, + "loss": 0.0415, + "step": 6550 + }, + { + "epoch": 17.479041916167663, + "grad_norm": 6.156332528917119e-05, + "learning_rate": 7.887634688515e-06, + "loss": 0.0486, + "step": 6555 + }, + { + "epoch": 17.492348636061212, + "grad_norm": 0.0003013143432326615, + "learning_rate": 7.805488815788286e-06, + "loss": 0.0135, + "step": 6560 + }, + { + "epoch": 17.505655355954758, + "grad_norm": 5.3178453526925296e-05, + "learning_rate": 7.72375556445577e-06, + "loss": 0.0368, + "step": 6565 + }, + { + "epoch": 17.518962075848304, + "grad_norm": 3.9541144360555336e-05, + "learning_rate": 7.642435300318907e-06, + "loss": 0.0285, + "step": 6570 + }, + { + "epoch": 17.53226879574185, + "grad_norm": 0.5803095102310181, + "learning_rate": 7.561528387330796e-06, + "loss": 0.0201, + "step": 6575 + }, + { + "epoch": 17.545575515635395, + "grad_norm": 0.00011860304948640987, + "learning_rate": 7.48103518759462e-06, + "loss": 0.0253, + "step": 6580 + }, + { + "epoch": 17.55888223552894, + "grad_norm": 0.0002037498779827729, + "learning_rate": 7.400956061361974e-06, + "loss": 0.0279, + "step": 6585 + }, + { + "epoch": 17.57218895542249, + "grad_norm": 0.5323101878166199, + "learning_rate": 7.3212913670311355e-06, + "loss": 0.055, + "step": 6590 + }, + { + "epoch": 17.585495675316036, + "grad_norm": 0.00022871862165629864, + "learning_rate": 7.242041461145688e-06, + "loss": 0.0169, + "step": 6595 + }, + { + "epoch": 17.59880239520958, + "grad_norm": 0.0006196050089783967, + "learning_rate": 7.163206698392744e-06, + "loss": 0.0364, + "step": 6600 + }, + { + "epoch": 17.612109115103127, + "grad_norm": 0.7550682425498962, + "learning_rate": 7.084787431601436e-06, + "loss": 0.0423, + "step": 6605 + }, + { + "epoch": 17.625415834996673, + "grad_norm": 0.4589405655860901, + "learning_rate": 7.006784011741374e-06, + "loss": 0.0273, + "step": 6610 + }, + { + "epoch": 17.63872255489022, + "grad_norm": 0.7806951403617859, + "learning_rate": 6.929196787920899e-06, + "loss": 0.0095, + "step": 6615 + }, + { + "epoch": 17.652029274783764, + "grad_norm": 1.110194444656372, + "learning_rate": 6.852026107385756e-06, + "loss": 0.0538, + "step": 6620 + }, + { + "epoch": 17.665335994677314, + "grad_norm": 4.437299503479153e-05, + "learning_rate": 6.7752723155174226e-06, + "loss": 0.0419, + "step": 6625 + }, + { + "epoch": 17.67864271457086, + "grad_norm": 0.8215072154998779, + "learning_rate": 6.698935755831492e-06, + "loss": 0.0946, + "step": 6630 + }, + { + "epoch": 17.691949434464405, + "grad_norm": 0.46991631388664246, + "learning_rate": 6.623016769976309e-06, + "loss": 0.0473, + "step": 6635 + }, + { + "epoch": 17.70525615435795, + "grad_norm": 4.036861355416477e-05, + "learning_rate": 6.547515697731244e-06, + "loss": 0.0444, + "step": 6640 + }, + { + "epoch": 17.718562874251496, + "grad_norm": 0.4827462136745453, + "learning_rate": 6.472432877005341e-06, + "loss": 0.0585, + "step": 6645 + }, + { + "epoch": 17.731869594145042, + "grad_norm": 5.3903575462754816e-05, + "learning_rate": 6.397768643835755e-06, + "loss": 0.0099, + "step": 6650 + }, + { + "epoch": 17.745176314038588, + "grad_norm": 0.4768927991390228, + "learning_rate": 6.323523332386172e-06, + "loss": 0.0252, + "step": 6655 + }, + { + "epoch": 17.758483033932137, + "grad_norm": 1.0958563089370728, + "learning_rate": 6.2496972749453766e-06, + "loss": 0.0376, + "step": 6660 + }, + { + "epoch": 17.771789753825683, + "grad_norm": 0.0001639130205148831, + "learning_rate": 6.176290801925821e-06, + "loss": 0.0652, + "step": 6665 + }, + { + "epoch": 17.78509647371923, + "grad_norm": 0.7531500458717346, + "learning_rate": 6.103304241862007e-06, + "loss": 0.067, + "step": 6670 + }, + { + "epoch": 17.798403193612774, + "grad_norm": 0.6905338764190674, + "learning_rate": 6.030737921409169e-06, + "loss": 0.0175, + "step": 6675 + }, + { + "epoch": 17.81170991350632, + "grad_norm": 0.8327066898345947, + "learning_rate": 5.95859216534167e-06, + "loss": 0.0277, + "step": 6680 + }, + { + "epoch": 17.825016633399866, + "grad_norm": 0.8314510583877563, + "learning_rate": 5.886867296551646e-06, + "loss": 0.0669, + "step": 6685 + }, + { + "epoch": 17.83832335329341, + "grad_norm": 1.0047413110733032, + "learning_rate": 5.8155636360475385e-06, + "loss": 0.0529, + "step": 6690 + }, + { + "epoch": 17.85163007318696, + "grad_norm": 0.4971298575401306, + "learning_rate": 5.74468150295262e-06, + "loss": 0.0517, + "step": 6695 + }, + { + "epoch": 17.864936793080506, + "grad_norm": 6.307179864961654e-05, + "learning_rate": 5.674221214503639e-06, + "loss": 0.0346, + "step": 6700 + }, + { + "epoch": 17.878243512974052, + "grad_norm": 4.990035813534632e-05, + "learning_rate": 5.604183086049342e-06, + "loss": 0.0455, + "step": 6705 + }, + { + "epoch": 17.891550232867598, + "grad_norm": 0.7055619955062866, + "learning_rate": 5.534567431049009e-06, + "loss": 0.0293, + "step": 6710 + }, + { + "epoch": 17.904856952761143, + "grad_norm": 2.4250119167845696e-05, + "learning_rate": 5.465374561071212e-06, + "loss": 0.0179, + "step": 6715 + }, + { + "epoch": 17.91816367265469, + "grad_norm": 0.0003098284068983048, + "learning_rate": 5.396604785792281e-06, + "loss": 0.0296, + "step": 6720 + }, + { + "epoch": 17.93147039254824, + "grad_norm": 0.161666139960289, + "learning_rate": 5.328258412994958e-06, + "loss": 0.0405, + "step": 6725 + }, + { + "epoch": 17.944777112441784, + "grad_norm": 0.5785791873931885, + "learning_rate": 5.26033574856708e-06, + "loss": 0.0873, + "step": 6730 + }, + { + "epoch": 17.95808383233533, + "grad_norm": 0.1611766219139099, + "learning_rate": 5.192837096500058e-06, + "loss": 0.0396, + "step": 6735 + }, + { + "epoch": 17.971390552228875, + "grad_norm": 3.467617352725938e-05, + "learning_rate": 5.125762758887687e-06, + "loss": 0.0195, + "step": 6740 + }, + { + "epoch": 17.98469727212242, + "grad_norm": 0.00013225249131210148, + "learning_rate": 5.059113035924712e-06, + "loss": 0.0179, + "step": 6745 + }, + { + "epoch": 17.998003992015967, + "grad_norm": 0.4885573983192444, + "learning_rate": 4.992888225905468e-06, + "loss": 0.0431, + "step": 6750 + }, + { + "epoch": 17.998003992015967, + "eval_loss": 1.6460537910461426, + "eval_macro_f1": 68.10039104694555, + "eval_macro_precision": 70.49737286259203, + "eval_macro_recall": 66.37495253502834, + "eval_micro_f1": 87.5, + "eval_micro_precision": 87.5, + "eval_micro_recall": 87.5, + "eval_runtime": 4.834, + "eval_samples_per_second": 552.753, + "eval_steps_per_second": 34.547, + "step": 6750 + }, + { + "epoch": 18.013306719893546, + "grad_norm": 0.4641469419002533, + "learning_rate": 4.927088625222598e-06, + "loss": 0.0252, + "step": 6755 + }, + { + "epoch": 18.02661343978709, + "grad_norm": 0.00014642822497989982, + "learning_rate": 4.861714528365646e-06, + "loss": 0.0091, + "step": 6760 + }, + { + "epoch": 18.039920159680637, + "grad_norm": 3.572425339370966e-05, + "learning_rate": 4.796766227919857e-06, + "loss": 0.0282, + "step": 6765 + }, + { + "epoch": 18.053226879574186, + "grad_norm": 0.449663907289505, + "learning_rate": 4.7322440145647905e-06, + "loss": 0.0163, + "step": 6770 + }, + { + "epoch": 18.066533599467732, + "grad_norm": 0.6586659550666809, + "learning_rate": 4.668148177072984e-06, + "loss": 0.0358, + "step": 6775 + }, + { + "epoch": 18.079840319361278, + "grad_norm": 0.6322921514511108, + "learning_rate": 4.604479002308737e-06, + "loss": 0.0335, + "step": 6780 + }, + { + "epoch": 18.093147039254823, + "grad_norm": 7.740201544947922e-05, + "learning_rate": 4.541236775226809e-06, + "loss": 0.0344, + "step": 6785 + }, + { + "epoch": 18.10645375914837, + "grad_norm": 7.509323040721938e-05, + "learning_rate": 4.478421778871112e-06, + "loss": 0.0001, + "step": 6790 + }, + { + "epoch": 18.119760479041915, + "grad_norm": 0.0002992966619785875, + "learning_rate": 4.416034294373472e-06, + "loss": 0.0082, + "step": 6795 + }, + { + "epoch": 18.133067198935464, + "grad_norm": 0.2723856270313263, + "learning_rate": 4.354074600952407e-06, + "loss": 0.0689, + "step": 6800 + }, + { + "epoch": 18.14637391882901, + "grad_norm": 0.7682840824127197, + "learning_rate": 4.292542975911717e-06, + "loss": 0.0656, + "step": 6805 + }, + { + "epoch": 18.159680638722556, + "grad_norm": 0.5859367847442627, + "learning_rate": 4.231439694639483e-06, + "loss": 0.0403, + "step": 6810 + }, + { + "epoch": 18.1729873586161, + "grad_norm": 0.0007575666531920433, + "learning_rate": 4.170765030606638e-06, + "loss": 0.0266, + "step": 6815 + }, + { + "epoch": 18.186294078509647, + "grad_norm": 0.83262038230896, + "learning_rate": 4.110519255365852e-06, + "loss": 0.0619, + "step": 6820 + }, + { + "epoch": 18.199600798403193, + "grad_norm": 0.00012123231863370165, + "learning_rate": 4.050702638550275e-06, + "loss": 0.0531, + "step": 6825 + }, + { + "epoch": 18.21290751829674, + "grad_norm": 0.47834277153015137, + "learning_rate": 3.991315447872302e-06, + "loss": 0.029, + "step": 6830 + }, + { + "epoch": 18.226214238190288, + "grad_norm": 0.0005108633195050061, + "learning_rate": 3.932357949122445e-06, + "loss": 0.0282, + "step": 6835 + }, + { + "epoch": 18.239520958083833, + "grad_norm": 8.286790398415178e-05, + "learning_rate": 3.873830406168111e-06, + "loss": 0.0206, + "step": 6840 + }, + { + "epoch": 18.25282767797738, + "grad_norm": 0.4972341060638428, + "learning_rate": 3.815733080952411e-06, + "loss": 0.0435, + "step": 6845 + }, + { + "epoch": 18.266134397870925, + "grad_norm": 0.00017122157441917807, + "learning_rate": 3.758066233492952e-06, + "loss": 0.0351, + "step": 6850 + }, + { + "epoch": 18.27944111776447, + "grad_norm": 0.5052186250686646, + "learning_rate": 3.7008301218807716e-06, + "loss": 0.0492, + "step": 6855 + }, + { + "epoch": 18.292747837658016, + "grad_norm": 0.3930492401123047, + "learning_rate": 3.6440250022790834e-06, + "loss": 0.0149, + "step": 6860 + }, + { + "epoch": 18.306054557551562, + "grad_norm": 0.00020653626415878534, + "learning_rate": 3.5876511289222314e-06, + "loss": 0.029, + "step": 6865 + }, + { + "epoch": 18.31936127744511, + "grad_norm": 0.001036687521263957, + "learning_rate": 3.5317087541144377e-06, + "loss": 0.0627, + "step": 6870 + }, + { + "epoch": 18.332667997338657, + "grad_norm": 0.1497216373682022, + "learning_rate": 3.476198128228736e-06, + "loss": 0.0134, + "step": 6875 + }, + { + "epoch": 18.345974717232203, + "grad_norm": 0.003397771855816245, + "learning_rate": 3.4211194997058714e-06, + "loss": 0.0346, + "step": 6880 + }, + { + "epoch": 18.35928143712575, + "grad_norm": 0.001260089105926454, + "learning_rate": 3.3664731150531482e-06, + "loss": 0.0244, + "step": 6885 + }, + { + "epoch": 18.372588157019294, + "grad_norm": 0.49642452597618103, + "learning_rate": 3.3122592188433497e-06, + "loss": 0.0461, + "step": 6890 + }, + { + "epoch": 18.38589487691284, + "grad_norm": 0.519691526889801, + "learning_rate": 3.2584780537136207e-06, + "loss": 0.0808, + "step": 6895 + }, + { + "epoch": 18.39920159680639, + "grad_norm": 0.0005471158656291664, + "learning_rate": 3.2051298603643753e-06, + "loss": 0.0, + "step": 6900 + }, + { + "epoch": 18.412508316699935, + "grad_norm": 0.5794316530227661, + "learning_rate": 3.1522148775583016e-06, + "loss": 0.0759, + "step": 6905 + }, + { + "epoch": 18.42581503659348, + "grad_norm": 0.00023820818751119077, + "learning_rate": 3.099733342119171e-06, + "loss": 0.0187, + "step": 6910 + }, + { + "epoch": 18.439121756487026, + "grad_norm": 0.7367857694625854, + "learning_rate": 3.047685488930874e-06, + "loss": 0.0367, + "step": 6915 + }, + { + "epoch": 18.45242847638057, + "grad_norm": 0.7412865161895752, + "learning_rate": 2.996071550936319e-06, + "loss": 0.0626, + "step": 6920 + }, + { + "epoch": 18.465735196274117, + "grad_norm": 0.49905386567115784, + "learning_rate": 2.9448917591363925e-06, + "loss": 0.0273, + "step": 6925 + }, + { + "epoch": 18.479041916167663, + "grad_norm": 0.7662301063537598, + "learning_rate": 2.894146342588977e-06, + "loss": 0.0338, + "step": 6930 + }, + { + "epoch": 18.492348636061212, + "grad_norm": 7.189960888354108e-05, + "learning_rate": 2.8438355284078234e-06, + "loss": 0.0461, + "step": 6935 + }, + { + "epoch": 18.505655355954758, + "grad_norm": 0.00035782987833954394, + "learning_rate": 2.793959541761659e-06, + "loss": 0.0098, + "step": 6940 + }, + { + "epoch": 18.518962075848304, + "grad_norm": 0.0002716764574870467, + "learning_rate": 2.744518605873092e-06, + "loss": 0.0429, + "step": 6945 + }, + { + "epoch": 18.53226879574185, + "grad_norm": 0.5008606314659119, + "learning_rate": 2.6955129420176196e-06, + "loss": 0.0341, + "step": 6950 + }, + { + "epoch": 18.545575515635395, + "grad_norm": 0.6177487969398499, + "learning_rate": 2.6469427695226757e-06, + "loss": 0.0268, + "step": 6955 + }, + { + "epoch": 18.55888223552894, + "grad_norm": 9.384102304466069e-05, + "learning_rate": 2.5988083057666533e-06, + "loss": 0.0555, + "step": 6960 + }, + { + "epoch": 18.57218895542249, + "grad_norm": 0.00016950270219240338, + "learning_rate": 2.5511097661778503e-06, + "loss": 0.0441, + "step": 6965 + }, + { + "epoch": 18.585495675316036, + "grad_norm": 0.5722589492797852, + "learning_rate": 2.5038473642336137e-06, + "loss": 0.0268, + "step": 6970 + }, + { + "epoch": 18.59880239520958, + "grad_norm": 0.5383985042572021, + "learning_rate": 2.4570213114592954e-06, + "loss": 0.0515, + "step": 6975 + }, + { + "epoch": 18.612109115103127, + "grad_norm": 0.00039173016557469964, + "learning_rate": 2.410631817427378e-06, + "loss": 0.0372, + "step": 6980 + }, + { + "epoch": 18.625415834996673, + "grad_norm": 0.6350005269050598, + "learning_rate": 2.3646790897564943e-06, + "loss": 0.0265, + "step": 6985 + }, + { + "epoch": 18.63872255489022, + "grad_norm": 0.00014799917698837817, + "learning_rate": 2.3191633341104856e-06, + "loss": 0.0177, + "step": 6990 + }, + { + "epoch": 18.652029274783764, + "grad_norm": 0.0001857498282333836, + "learning_rate": 2.274084754197514e-06, + "loss": 0.0408, + "step": 6995 + }, + { + "epoch": 18.665335994677314, + "grad_norm": 0.00058632658328861, + "learning_rate": 2.2294435517691503e-06, + "loss": 0.0175, + "step": 7000 + }, + { + "epoch": 18.67864271457086, + "grad_norm": 0.0031339959241449833, + "learning_rate": 2.1852399266194314e-06, + "loss": 0.0115, + "step": 7005 + }, + { + "epoch": 18.691949434464405, + "grad_norm": 8.091120253084227e-05, + "learning_rate": 2.141474076584038e-06, + "loss": 0.0221, + "step": 7010 + }, + { + "epoch": 18.70525615435795, + "grad_norm": 0.013986263424158096, + "learning_rate": 2.098146197539319e-06, + "loss": 0.0082, + "step": 7015 + }, + { + "epoch": 18.718562874251496, + "grad_norm": 0.5572426319122314, + "learning_rate": 2.05525648340148e-06, + "loss": 0.0353, + "step": 7020 + }, + { + "epoch": 18.731869594145042, + "grad_norm": 7.861760241212323e-05, + "learning_rate": 2.0128051261257164e-06, + "loss": 0.0, + "step": 7025 + }, + { + "epoch": 18.745176314038588, + "grad_norm": 6.267589196795598e-05, + "learning_rate": 1.9707923157052833e-06, + "loss": 0.0348, + "step": 7030 + }, + { + "epoch": 18.758483033932137, + "grad_norm": 0.5337706804275513, + "learning_rate": 1.9292182401707603e-06, + "loss": 0.0535, + "step": 7035 + }, + { + "epoch": 18.771789753825683, + "grad_norm": 0.5486602783203125, + "learning_rate": 1.8880830855891096e-06, + "loss": 0.0181, + "step": 7040 + }, + { + "epoch": 18.78509647371923, + "grad_norm": 0.42544683814048767, + "learning_rate": 1.847387036062853e-06, + "loss": 0.0521, + "step": 7045 + }, + { + "epoch": 18.798403193612774, + "grad_norm": 0.553593099117279, + "learning_rate": 1.8071302737293295e-06, + "loss": 0.0457, + "step": 7050 + }, + { + "epoch": 18.81170991350632, + "grad_norm": 0.00020910767489112914, + "learning_rate": 1.7673129787598054e-06, + "loss": 0.0242, + "step": 7055 + }, + { + "epoch": 18.825016633399866, + "grad_norm": 0.00025782870943658054, + "learning_rate": 1.7279353293586765e-06, + "loss": 0.0356, + "step": 7060 + }, + { + "epoch": 18.83832335329341, + "grad_norm": 7.453611033270136e-05, + "learning_rate": 1.6889975017626903e-06, + "loss": 0.035, + "step": 7065 + }, + { + "epoch": 18.85163007318696, + "grad_norm": 0.6055145859718323, + "learning_rate": 1.6504996702401243e-06, + "loss": 0.0375, + "step": 7070 + }, + { + "epoch": 18.864936793080506, + "grad_norm": 0.5281118750572205, + "learning_rate": 1.6124420070900758e-06, + "loss": 0.0345, + "step": 7075 + }, + { + "epoch": 18.878243512974052, + "grad_norm": 0.5610820055007935, + "learning_rate": 1.574824682641629e-06, + "loss": 0.0539, + "step": 7080 + }, + { + "epoch": 18.891550232867598, + "grad_norm": 1.1242096424102783, + "learning_rate": 1.537647865253089e-06, + "loss": 0.0472, + "step": 7085 + }, + { + "epoch": 18.904856952761143, + "grad_norm": 1.0025428533554077, + "learning_rate": 1.50091172131126e-06, + "loss": 0.0549, + "step": 7090 + }, + { + "epoch": 18.91816367265469, + "grad_norm": 0.46737486124038696, + "learning_rate": 1.4646164152307018e-06, + "loss": 0.0545, + "step": 7095 + }, + { + "epoch": 18.93147039254824, + "grad_norm": 0.5139942765235901, + "learning_rate": 1.4287621094529524e-06, + "loss": 0.0264, + "step": 7100 + }, + { + "epoch": 18.944777112441784, + "grad_norm": 3.634204404079355e-05, + "learning_rate": 1.3933489644458619e-06, + "loss": 0.0, + "step": 7105 + }, + { + "epoch": 18.95808383233533, + "grad_norm": 1.1411399841308594, + "learning_rate": 1.3583771387028265e-06, + "loss": 0.0495, + "step": 7110 + }, + { + "epoch": 18.971390552228875, + "grad_norm": 0.0005110076745040715, + "learning_rate": 1.323846788742078e-06, + "loss": 0.0249, + "step": 7115 + }, + { + "epoch": 18.98469727212242, + "grad_norm": 0.8430808186531067, + "learning_rate": 1.2897580691060506e-06, + "loss": 0.0498, + "step": 7120 + }, + { + "epoch": 18.998003992015967, + "grad_norm": 9.565344225848094e-05, + "learning_rate": 1.2561111323605712e-06, + "loss": 0.0094, + "step": 7125 + }, + { + "epoch": 18.998003992015967, + "eval_loss": 1.6504560708999634, + "eval_macro_f1": 68.05326783992479, + "eval_macro_precision": 70.41695830446321, + "eval_macro_recall": 66.35973183487613, + "eval_micro_f1": 87.4625748502994, + "eval_micro_precision": 87.4625748502994, + "eval_micro_recall": 87.4625748502994, + "eval_runtime": 4.4506, + "eval_samples_per_second": 600.367, + "eval_steps_per_second": 37.523, + "step": 7125 + }, + { + "epoch": 19.013306719893546, + "grad_norm": 4.465243182494305e-05, + "learning_rate": 1.2229061290942922e-06, + "loss": 0.0366, + "step": 7130 + }, + { + "epoch": 19.02661343978709, + "grad_norm": 0.0008600103319622576, + "learning_rate": 1.1901432079179709e-06, + "loss": 0.0187, + "step": 7135 + }, + { + "epoch": 19.039920159680637, + "grad_norm": 4.5936063543194905e-05, + "learning_rate": 1.157822515463758e-06, + "loss": 0.0644, + "step": 7140 + }, + { + "epoch": 19.053226879574186, + "grad_norm": 0.1492280811071396, + "learning_rate": 1.1259441963846429e-06, + "loss": 0.0575, + "step": 7145 + }, + { + "epoch": 19.066533599467732, + "grad_norm": 8.396996418014169e-05, + "learning_rate": 1.0945083933537103e-06, + "loss": 0.0452, + "step": 7150 + }, + { + "epoch": 19.079840319361278, + "grad_norm": 0.4817587435245514, + "learning_rate": 1.0635152470635512e-06, + "loss": 0.0221, + "step": 7155 + }, + { + "epoch": 19.093147039254823, + "grad_norm": 8.231854008045048e-05, + "learning_rate": 1.0329648962256411e-06, + "loss": 0.0077, + "step": 7160 + }, + { + "epoch": 19.10645375914837, + "grad_norm": 5.4089796321932226e-05, + "learning_rate": 1.002857477569663e-06, + "loss": 0.0174, + "step": 7165 + }, + { + "epoch": 19.119760479041915, + "grad_norm": 0.20306649804115295, + "learning_rate": 9.731931258429638e-07, + "loss": 0.024, + "step": 7170 + }, + { + "epoch": 19.133067198935464, + "grad_norm": 0.4647962152957916, + "learning_rate": 9.439719738099317e-07, + "loss": 0.0156, + "step": 7175 + }, + { + "epoch": 19.14637391882901, + "grad_norm": 5.901711483602412e-05, + "learning_rate": 9.15194152251353e-07, + "loss": 0.0333, + "step": 7180 + }, + { + "epoch": 19.159680638722556, + "grad_norm": 0.00029276739223860204, + "learning_rate": 8.868597899638898e-07, + "loss": 0.0171, + "step": 7185 + }, + { + "epoch": 19.1729873586161, + "grad_norm": 0.46516790986061096, + "learning_rate": 8.589690137594697e-07, + "loss": 0.0254, + "step": 7190 + }, + { + "epoch": 19.186294078509647, + "grad_norm": 0.00011026985885109752, + "learning_rate": 8.315219484647308e-07, + "loss": 0.0425, + "step": 7195 + }, + { + "epoch": 19.199600798403193, + "grad_norm": 5.936959132668562e-05, + "learning_rate": 8.04518716920466e-07, + "loss": 0.0362, + "step": 7200 + }, + { + "epoch": 19.21290751829674, + "grad_norm": 3.622892836574465e-05, + "learning_rate": 7.779594399810685e-07, + "loss": 0.0082, + "step": 7205 + }, + { + "epoch": 19.226214238190288, + "grad_norm": 0.00019010104006156325, + "learning_rate": 7.518442365139766e-07, + "loss": 0.0529, + "step": 7210 + }, + { + "epoch": 19.239520958083833, + "grad_norm": 6.970770482439548e-05, + "learning_rate": 7.261732233991513e-07, + "loss": 0.0409, + "step": 7215 + }, + { + "epoch": 19.25282767797738, + "grad_norm": 5.463602428790182e-05, + "learning_rate": 7.009465155285777e-07, + "loss": 0.048, + "step": 7220 + }, + { + "epoch": 19.266134397870925, + "grad_norm": 0.7579261660575867, + "learning_rate": 6.761642258056978e-07, + "loss": 0.0654, + "step": 7225 + }, + { + "epoch": 19.27944111776447, + "grad_norm": 0.48954853415489197, + "learning_rate": 6.518264651449779e-07, + "loss": 0.0352, + "step": 7230 + }, + { + "epoch": 19.292747837658016, + "grad_norm": 3.727992225321941e-05, + "learning_rate": 6.279333424713429e-07, + "loss": 0.0436, + "step": 7235 + }, + { + "epoch": 19.306054557551562, + "grad_norm": 4.1896390030160546e-05, + "learning_rate": 6.044849647197093e-07, + "loss": 0.0293, + "step": 7240 + }, + { + "epoch": 19.31936127744511, + "grad_norm": 0.5448512434959412, + "learning_rate": 5.814814368345412e-07, + "loss": 0.0343, + "step": 7245 + }, + { + "epoch": 19.332667997338657, + "grad_norm": 0.5785821676254272, + "learning_rate": 5.589228617693288e-07, + "loss": 0.0362, + "step": 7250 + }, + { + "epoch": 19.345974717232203, + "grad_norm": 0.547781765460968, + "learning_rate": 5.368093404861774e-07, + "loss": 0.0145, + "step": 7255 + }, + { + "epoch": 19.35928143712575, + "grad_norm": 3.6774767067981884e-05, + "learning_rate": 5.151409719553079e-07, + "loss": 0.0167, + "step": 7260 + }, + { + "epoch": 19.372588157019294, + "grad_norm": 5.5066615459509194e-05, + "learning_rate": 4.939178531546462e-07, + "loss": 0.0351, + "step": 7265 + }, + { + "epoch": 19.38589487691284, + "grad_norm": 4.909703784505837e-05, + "learning_rate": 4.731400790693785e-07, + "loss": 0.0171, + "step": 7270 + }, + { + "epoch": 19.39920159680639, + "grad_norm": 0.5192339420318604, + "learning_rate": 4.5280774269154115e-07, + "loss": 0.0357, + "step": 7275 + }, + { + "epoch": 19.412508316699935, + "grad_norm": 0.5900948643684387, + "learning_rate": 4.329209350195651e-07, + "loss": 0.0571, + "step": 7280 + }, + { + "epoch": 19.42581503659348, + "grad_norm": 1.5432310104370117, + "learning_rate": 4.1347974505789867e-07, + "loss": 0.0472, + "step": 7285 + }, + { + "epoch": 19.439121756487026, + "grad_norm": 0.0005136824329383671, + "learning_rate": 3.9448425981661876e-07, + "loss": 0.0274, + "step": 7290 + }, + { + "epoch": 19.45242847638057, + "grad_norm": 0.6122019290924072, + "learning_rate": 3.7593456431103123e-07, + "loss": 0.0292, + "step": 7295 + }, + { + "epoch": 19.465735196274117, + "grad_norm": 0.0002864995039999485, + "learning_rate": 3.578307415612714e-07, + "loss": 0.0358, + "step": 7300 + }, + { + "epoch": 19.479041916167663, + "grad_norm": 1.156638503074646, + "learning_rate": 3.401728725919373e-07, + "loss": 0.047, + "step": 7305 + }, + { + "epoch": 19.492348636061212, + "grad_norm": 0.7358021140098572, + "learning_rate": 3.229610364317792e-07, + "loss": 0.0536, + "step": 7310 + }, + { + "epoch": 19.505655355954758, + "grad_norm": 0.0011500949040055275, + "learning_rate": 3.061953101132442e-07, + "loss": 0.0213, + "step": 7315 + }, + { + "epoch": 19.518962075848304, + "grad_norm": 0.4783131182193756, + "learning_rate": 2.898757686722542e-07, + "loss": 0.0264, + "step": 7320 + }, + { + "epoch": 19.53226879574185, + "grad_norm": 0.00014955556252971292, + "learning_rate": 2.7400248514776183e-07, + "loss": 0.0446, + "step": 7325 + }, + { + "epoch": 19.545575515635395, + "grad_norm": 0.0015204122755676508, + "learning_rate": 2.585755305814841e-07, + "loss": 0.0029, + "step": 7330 + }, + { + "epoch": 19.55888223552894, + "grad_norm": 3.2041556551121175e-05, + "learning_rate": 2.4359497401758024e-07, + "loss": 0.0165, + "step": 7335 + }, + { + "epoch": 19.57218895542249, + "grad_norm": 4.298180647310801e-05, + "learning_rate": 2.2906088250229664e-07, + "loss": 0.028, + "step": 7340 + }, + { + "epoch": 19.585495675316036, + "grad_norm": 4.960396108799614e-05, + "learning_rate": 2.1497332108375568e-07, + "loss": 0.0264, + "step": 7345 + }, + { + "epoch": 19.59880239520958, + "grad_norm": 7.29636667529121e-05, + "learning_rate": 2.0133235281156736e-07, + "loss": 0.0161, + "step": 7350 + }, + { + "epoch": 19.612109115103127, + "grad_norm": 0.5350423455238342, + "learning_rate": 1.8813803873659607e-07, + "loss": 0.0601, + "step": 7355 + }, + { + "epoch": 19.625415834996673, + "grad_norm": 2.966666579595767e-05, + "learning_rate": 1.753904379106941e-07, + "loss": 0.0095, + "step": 7360 + }, + { + "epoch": 19.63872255489022, + "grad_norm": 0.0019933367148041725, + "learning_rate": 1.630896073864352e-07, + "loss": 0.0165, + "step": 7365 + }, + { + "epoch": 19.652029274783764, + "grad_norm": 0.00011221903696423396, + "learning_rate": 1.5123560221681487e-07, + "loss": 0.0029, + "step": 7370 + }, + { + "epoch": 19.665335994677314, + "grad_norm": 0.0003375353990122676, + "learning_rate": 1.3982847545507271e-07, + "loss": 0.0397, + "step": 7375 + }, + { + "epoch": 19.67864271457086, + "grad_norm": 0.9430463910102844, + "learning_rate": 1.2886827815440372e-07, + "loss": 0.0417, + "step": 7380 + }, + { + "epoch": 19.691949434464405, + "grad_norm": 0.1029142215847969, + "learning_rate": 1.1835505936773628e-07, + "loss": 0.0372, + "step": 7385 + }, + { + "epoch": 19.70525615435795, + "grad_norm": 0.0002183911856263876, + "learning_rate": 1.0828886614754341e-07, + "loss": 0.0246, + "step": 7390 + }, + { + "epoch": 19.718562874251496, + "grad_norm": 0.5175012946128845, + "learning_rate": 9.866974354560965e-08, + "loss": 0.0408, + "step": 7395 + }, + { + "epoch": 19.731869594145042, + "grad_norm": 0.14726030826568604, + "learning_rate": 8.949773461282008e-08, + "loss": 0.0144, + "step": 7400 + }, + { + "epoch": 19.745176314038588, + "grad_norm": 6.141362973721698e-05, + "learning_rate": 8.077288039898267e-08, + "loss": 0.0249, + "step": 7405 + }, + { + "epoch": 19.758483033932137, + "grad_norm": 0.4466927647590637, + "learning_rate": 7.249521995263964e-08, + "loss": 0.022, + "step": 7410 + }, + { + "epoch": 19.771789753825683, + "grad_norm": 0.4848124384880066, + "learning_rate": 6.466479032091189e-08, + "loss": 0.0246, + "step": 7415 + }, + { + "epoch": 19.78509647371923, + "grad_norm": 0.5077440738677979, + "learning_rate": 5.7281626549277046e-08, + "loss": 0.0317, + "step": 7420 + }, + { + "epoch": 19.798403193612774, + "grad_norm": 0.0002426803985144943, + "learning_rate": 5.0345761681491746e-08, + "loss": 0.0332, + "step": 7425 + }, + { + "epoch": 19.81170991350632, + "grad_norm": 0.7584611177444458, + "learning_rate": 4.385722675936954e-08, + "loss": 0.0657, + "step": 7430 + }, + { + "epoch": 19.825016633399866, + "grad_norm": 0.4777810573577881, + "learning_rate": 3.781605082270323e-08, + "loss": 0.0693, + "step": 7435 + }, + { + "epoch": 19.83832335329341, + "grad_norm": 0.4914855659008026, + "learning_rate": 3.2222260909087196e-08, + "loss": 0.0568, + "step": 7440 + }, + { + "epoch": 19.85163007318696, + "grad_norm": 0.14886783063411713, + "learning_rate": 2.7075882053828605e-08, + "loss": 0.0375, + "step": 7445 + }, + { + "epoch": 19.864936793080506, + "grad_norm": 0.5256046056747437, + "learning_rate": 2.2376937289814158e-08, + "loss": 0.0281, + "step": 7450 + }, + { + "epoch": 19.878243512974052, + "grad_norm": 0.5002803802490234, + "learning_rate": 1.81254476474213e-08, + "loss": 0.0512, + "step": 7455 + }, + { + "epoch": 19.891550232867598, + "grad_norm": 0.5000232458114624, + "learning_rate": 1.4321432154418279e-08, + "loss": 0.0741, + "step": 7460 + }, + { + "epoch": 19.904856952761143, + "grad_norm": 0.00015142855409067124, + "learning_rate": 1.0964907835864235e-08, + "loss": 0.0367, + "step": 7465 + }, + { + "epoch": 19.91816367265469, + "grad_norm": 0.7019768357276917, + "learning_rate": 8.055889714064791e-09, + "loss": 0.0499, + "step": 7470 + }, + { + "epoch": 19.93147039254824, + "grad_norm": 4.4128122681286186e-05, + "learning_rate": 5.594390808494332e-09, + "loss": 0.035, + "step": 7475 + }, + { + "epoch": 19.944777112441784, + "grad_norm": 0.5177294015884399, + "learning_rate": 3.5804221357182976e-09, + "loss": 0.0339, + "step": 7480 + }, + { + "epoch": 19.95808383233533, + "grad_norm": 0.0001399219618178904, + "learning_rate": 2.0139927093487664e-09, + "loss": 0.0168, + "step": 7485 + }, + { + "epoch": 19.971390552228875, + "grad_norm": 0.8865404725074768, + "learning_rate": 8.951095400333564e-10, + "loss": 0.0492, + "step": 7490 + }, + { + "epoch": 19.98469727212242, + "grad_norm": 0.43994656205177307, + "learning_rate": 2.237776353886112e-10, + "loss": 0.0501, + "step": 7495 + }, + { + "epoch": 19.998003992015967, + "grad_norm": 0.00010475648014107719, + "learning_rate": 0.0, + "loss": 0.0174, + "step": 7500 + }, + { + "epoch": 19.998003992015967, + "eval_loss": 1.651909589767456, + "eval_macro_f1": 68.1809240430323, + "eval_macro_precision": 70.57937372214415, + "eval_macro_recall": 66.4831886249996, + "eval_micro_f1": 87.5, + "eval_micro_precision": 87.5, + "eval_micro_recall": 87.5, + "eval_runtime": 5.4153, + "eval_samples_per_second": 493.416, + "eval_steps_per_second": 30.838, + "step": 7500 + }, + { + "epoch": 19.998003992015967, + "step": 7500, + "total_flos": 1.5762278746161152e+16, + "train_loss": 0.23192051490644613, + "train_runtime": 3495.581, + "train_samples_per_second": 137.579, + "train_steps_per_second": 2.146 } ], "logging_steps": 5, - "max_steps": 1580, + "max_steps": 7500, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 50.0, @@ -2528,7 +10816,7 @@ "attributes": {} } }, - "total_flos": 8695327329615872.0, + "total_flos": 1.5762278746161152e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null